From 30348b275325b6c6c74dc36025c7e503aa01de5b Mon Sep 17 00:00:00 2001 From: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> Date: Wed, 28 Jan 2026 10:17:51 +0800 Subject: [PATCH] [None][fix] Proper conditional compilation of sm10x cubins (#10839) Signed-off-by: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> --- .cmake-format.json | 284 + cpp/cmake/modules/cuda_configuration.cmake | 453 +- cpp/tensorrt_llm/common/CMakeLists.txt | 1 + cpp/tensorrt_llm/kernels/CMakeLists.txt | 51 +- .../CMakeLists.txt | 70 +- .../CMakeLists.txt | 19 +- .../batchedGemm/CMakeLists.txt | 8 +- .../BatchedGemmInterface.h | 3 +- .../trtllmGen_bmm_export/KernelMetaInfo.h | 16910 ++++++++-------- .../trtllmGen_bmm_export/KernelTraits.h | 8 +- .../trtllmGenKernels/fmha/CMakeLists.txt | 6 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp 
| 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 
4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- 
...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 
+- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 
4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- 
...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 
+- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...2VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...axQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- 
...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...maxQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...SoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...xQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- 
...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...Kv128Persistent2CtaKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...2VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...axQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...Kv128Persistent2CtaKeepsAbForGen_cubin.cpp | 4 +- ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...xQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...maxQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...SoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...Kv128Persistent2CtaKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...oftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...2VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...axQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...maxQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...SoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 
4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 
4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 
+- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - 
...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 
4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 2 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + 
...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - 
...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 
+- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...2VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...axQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- 
...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...maxQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...SoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...xQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...Kv128Persistent2CtaKeepsAbForGen_cubin.cpp | 4 +- 
...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...2VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...axQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...Kv128Persistent2CtaKeepsAbForGen_cubin.cpp | 4 +- ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...xQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...maxQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...SoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...Kv128Persistent2CtaKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...SoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...2VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...axQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...maxQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...SoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- 
...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- 
...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + 
...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp} | 4 +- ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 
+- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 
3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + 
...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp 
| 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 2 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- 
...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + 
...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 
4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 
3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp 
| 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 
+- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp 
| 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 2 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - 
...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 
4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + 
...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - 
...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 
+- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp 
| 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 
- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp 
| 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- 
...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 - ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 - ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...tmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...xQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...ftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...xQ32Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...ftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...xQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 3 + ...ftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...axQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...oftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...oftmaxQ128Kv128PersistentContext_cubin.cpp | 4 +- ...ipsSoftmaxQ128Kv128StaticContext_cubin.cpp | 4 +- .../fmha/cubin/kernelMetaInfo.h | 13194 ++++++------ .../trtllmGenKernels/fmha/fmhaKernels.h | 4 +- .../trtllmGenKernels/gemm/CMakeLists.txt | 9 +- .../trtllmGen_gemm_export/GemmInterface.h | 2 +- .../trtllmGen_gemm_export/KernelMetaInfo.h | 9530 ++++----- .../gemm/trtllmGen_gemm_export/KernelTraits.h | 2 +- .../gemmGatedAct/CMakeLists.txt | 8 +- .../GemmGatedActInterface.h | 3 +- .../KernelMetaInfo.h | 38 +- .../trtllmGen_gatedAct_export/KernelTraits.h | 2 +- 2902 files changed, 27189 insertions(+), 24336 deletions(-) create mode 100644 .cmake-format.json create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp 
create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp 
create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp 
create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 
100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp rename cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/{FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp => FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp} (81%) create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 
100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 
100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 
100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp 
create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 
100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp diff --git a/.cmake-format.json 
b/.cmake-format.json new file mode 100644 index 0000000000..1bb7cdfa7b --- /dev/null +++ b/.cmake-format.json @@ -0,0 +1,284 @@ +{ + "_help_parse": "Options affecting listfile parsing", + "parse": { + "_help_additional_commands": [ + "Specify structure for custom cmake functions" + ], + "additional_commands": { + "filter_source_cuda_architectures": { + "flags": [ + "IMPLICIT_FAMILY" + ], + "kwargs": { + "SOURCE_LIST": "1", + "TARGET": "1", + "ARCHS": "+" + } + } + }, + "_help_vartags": [ + "Specify variable tags." + ], + "vartags": [], + "_help_proptags": [ + "Specify property tags." + ], + "proptags": [] + }, + "_help_format": "Options affecting formatting.", + "format": { + "_help_line_width": [ + "How wide to allow formatted cmake files" + ], + "line_width": 80, + "_help_tab_size": [ + "How many spaces to tab for indent" + ], + "tab_size": 2, + "_help_max_subgroups_hwrap": [ + "If an argument group contains more than this many sub-groups", + "(parg or kwarg groups) then force it to a vertical layout." + ], + "max_subgroups_hwrap": 2, + "_help_max_pargs_hwrap": [ + "If a positional argument group contains more than this many", + "arguments, then force it to a vertical layout." + ], + "max_pargs_hwrap": 6, + "_help_max_rows_cmdline": [ + "If a cmdline positional group consumes more than this many", + "lines without nesting, then invalidate the layout (and nest)" + ], + "max_rows_cmdline": 2, + "_help_separate_ctrl_name_with_space": [ + "If true, separate flow control names from their parentheses", + "with a space" + ], + "separate_ctrl_name_with_space": false, + "_help_separate_fn_name_with_space": [ + "If true, separate function names from parentheses with a", + "space" + ], + "separate_fn_name_with_space": false, + "_help_dangle_parens": [ + "If a statement is wrapped to more than one line, than dangle", + "the closing parenthesis on its own line." 
+ ], + "dangle_parens": false, + "_help_dangle_align": [ + "If the trailing parenthesis must be 'dangled' on its own", + "line, then align it to this reference: `prefix`: the start", + "of the statement, `prefix-indent`: the start of the", + "statement, plus one indentation level, `child`: align to", + "the column of the arguments" + ], + "dangle_align": "prefix", + "_help_min_prefix_chars": [ + "If the statement spelling length (including space and", + "parenthesis) is smaller than this amount, then force reject", + "nested layouts." + ], + "min_prefix_chars": 4, + "_help_max_prefix_chars": [ + "If the statement spelling length (including space and", + "parenthesis) is larger than the tab width by more than this", + "amount, then force reject un-nested layouts." + ], + "max_prefix_chars": 10, + "_help_max_lines_hwrap": [ + "If a candidate layout is wrapped horizontally but it exceeds", + "this many lines, then reject the layout." + ], + "max_lines_hwrap": 2, + "_help_line_ending": [ + "What style line endings to use in the output." + ], + "line_ending": "unix", + "_help_command_case": [ + "Format command names consistently as 'lower' or 'upper' case" + ], + "command_case": "canonical", + "_help_keyword_case": [ + "Format keywords consistently as 'lower' or 'upper' case" + ], + "keyword_case": "unchanged", + "_help_always_wrap": [ + "A list of command names which should always be wrapped" + ], + "always_wrap": [], + "_help_enable_sort": [ + "If true, the argument lists which are known to be sortable", + "will be sorted lexicographically" + ], + "enable_sort": true, + "_help_autosort": [ + "If true, the parsers may infer whether or not an argument", + "list is sortable (without annotation)." + ], + "autosort": false, + "_help_require_valid_layout": [ + "By default, if cmake-format cannot successfully fit", + "everything into the desired linewidth it will apply the", + "last, most aggressive attempt that it made.
If this flag is", + "True, however, cmake-format will print error, exit with non-", + "zero status code, and write-out nothing" + ], + "require_valid_layout": false, + "_help_layout_passes": [ + "A dictionary mapping layout nodes to a list of wrap", + "decisions. See the documentation for more information." + ], + "layout_passes": {} + }, + "_help_markup": "Options affecting comment reflow and formatting.", + "markup": { + "_help_bullet_char": [ + "What character to use for bulleted lists" + ], + "bullet_char": "*", + "_help_enum_char": [ + "What character to use as punctuation after numerals in an", + "enumerated list" + ], + "enum_char": ".", + "_help_first_comment_is_literal": [ + "If comment markup is enabled, don't reflow the first comment", + "block in each listfile. Use this to preserve formatting of", + "your copyright/license statements." + ], + "first_comment_is_literal": false, + "_help_literal_comment_pattern": [ + "If comment markup is enabled, don't reflow any comment block", + "which matches this (regex) pattern. Default is `None`", + "(disabled)." + ], + "literal_comment_pattern": null, + "_help_fence_pattern": [ + "Regular expression to match preformat fences in comments", + "default= ``r'^\\s*([`~]{3}[`~]*)(.*)$'``" + ], + "fence_pattern": "^\\s*([`~]{3}[`~]*)(.*)$", + "_help_ruler_pattern": [ + "Regular expression to match rulers in comments default=", + "``r'^\\s*[^\\w\\s]{3}.*[^\\w\\s]{3}$'``" + ], + "ruler_pattern": "^\\s*[^\\w\\s]{3}.*[^\\w\\s]{3}$", + "_help_explicit_trailing_pattern": [ + "If a comment line starts with this pattern then it", + "is explicitly a trailing comment for the preceding", + "argument. Default is '#<'" + ], + "explicit_trailing_pattern": "#<", + "_help_hashruler_min_length": [ + "If a comment line starts with at least this many consecutive", + "hash characters, then don't lstrip() them off.
This allows", + "for lazy hash rulers where the first hash char is not", + "separated by space" + ], + "hashruler_min_length": 10, + "_help_canonicalize_hashrulers": [ + "If true, then insert a space between the first hash char and", + "remaining hash chars in a hash ruler, and normalize its", + "length to fill the column" + ], + "canonicalize_hashrulers": true, + "_help_enable_markup": [ + "enable comment markup parsing and reflow" + ], + "enable_markup": true + }, + "_help_lint": "Options affecting the linter", + "lint": { + "_help_disabled_codes": [ + "a list of lint codes to disable" + ], + "disabled_codes": [], + "_help_function_pattern": [ + "regular expression pattern describing valid function names" + ], + "function_pattern": "[0-9a-z_]+", + "_help_macro_pattern": [ + "regular expression pattern describing valid macro names" + ], + "macro_pattern": "[0-9A-Z_]+", + "_help_global_var_pattern": [ + "regular expression pattern describing valid names for", + "variables with global (cache) scope" + ], + "global_var_pattern": "[A-Z][0-9A-Z_]+", + "_help_internal_var_pattern": [ + "regular expression pattern describing valid names for", + "variables with global scope (but internal semantic)" + ], + "internal_var_pattern": "_[A-Z][0-9A-Z_]+", + "_help_local_var_pattern": [ + "regular expression pattern describing valid names for", + "variables with local scope" + ], + "local_var_pattern": "[a-z][a-z0-9_]+", + "_help_private_var_pattern": [ + "regular expression pattern describing valid names for", + "private directory variables" + ], + "private_var_pattern": "_[0-9a-z_]+", + "_help_public_var_pattern": [ + "regular expression pattern describing valid names for public", + "directory variables" + ], + "public_var_pattern": "[A-Z][0-9A-Z_]+", + "_help_argument_var_pattern": [ + "regular expression pattern describing valid names for", + "function/macro arguments and loop variables."
+ ], + "argument_var_pattern": "[a-z][a-z0-9_]+", + "_help_keyword_pattern": [ + "regular expression pattern describing valid names for", + "keywords used in functions or macros" + ], + "keyword_pattern": "[A-Z][0-9A-Z_]+", + "_help_max_conditionals_custom_parser": [ + "In the heuristic for C0201, how many conditionals to match", + "within a loop in before considering the loop a parser." + ], + "max_conditionals_custom_parser": 2, + "_help_min_statement_spacing": [ + "Require at least this many newlines between statements" + ], + "min_statement_spacing": 1, + "_help_max_statement_spacing": [ + "Require no more than this many newlines between statements" + ], + "max_statement_spacing": 2, + "max_returns": 6, + "max_branches": 12, + "max_arguments": 5, + "max_localvars": 15, + "max_statements": 50 + }, + "_help_encode": "Options affecting file encoding", + "encode": { + "_help_emit_byteorder_mark": [ + "If true, emit the unicode byte-order mark (BOM) at the start", + "of the file" + ], + "emit_byteorder_mark": false, + "_help_input_encoding": [ + "Specify the encoding of the input file. Defaults to utf-8" + ], + "input_encoding": "utf-8", + "_help_output_encoding": [ + "Specify the encoding of the output file. Defaults to utf-8.", + "Note that cmake only claims to support utf-8 so be careful", + "when using anything else" + ], + "output_encoding": "utf-8" + }, + "_help_misc": "Miscellaneous configurations options.", + "misc": { + "_help_per_command": [ + "A dictionary containing any per-command configuration", + "overrides. Currently only `command_case` is supported." + ], + "per_command": {} + } +} diff --git a/cpp/cmake/modules/cuda_configuration.cmake b/cpp/cmake/modules/cuda_configuration.cmake index acef6d2dac..251cfbd8f2 100644 --- a/cpp/cmake/modules/cuda_configuration.cmake +++ b/cpp/cmake/modules/cuda_configuration.cmake @@ -15,10 +15,207 @@ # the License. 
# +#[=======================================================================[.rst: +CudaConfiguration +----------------- + +CUDA compiler and architecture configuration for TensorRT-LLM. + +This module provides functions and macros to configure the CUDA compiler, +manage CUDA architectures, and filter source files based on target +architectures. It is tailored to meet TensorRT-LLM's specific requirements +for optimized kernel compilation across multiple GPU generations. + +Macros +^^^^^^ + +.. command:: setup_cuda_compiler + + Detects and validates the CUDA compiler:: + + setup_cuda_compiler() + + This macro determines the CUDA compiler version before enabling the CUDA + language extension. It requires CUDA version 11.2 or later. + + The macro sets ``CMAKE_CUDA_COMPILER_VERSION`` upon successful detection. + +Functions +^^^^^^^^^ + +.. command:: setup_cuda_architectures + + Initializes and normalizes ``CMAKE_CUDA_ARCHITECTURES``:: + + setup_cuda_architectures() + + This function processes the ``CMAKE_CUDA_ARCHITECTURES`` variable and + configures architecture-specific compilation settings. This function should + be called after enabling the CUDA language extension. + + **Special Values for CMAKE_CUDA_ARCHITECTURES:** + + ``native`` + Resolves to the highest available architecture on the system. + Falls back to ``all`` if detection fails. + + ``all`` or unset + Resolves to architectures TensorRT-LLM is optimized for and the + compiler supports (80, 86, 89, 90, 100, 103, 120 depending on CUDA version). + + ``all-major`` + Unsupported. Results in a fatal error. + + **Architecture Processing:** + + * PTX is never included in the result binary (``-virtual`` rejected). + * The ``-real`` suffix is automatically added to exclude PTX. + * Accelerated targets (``-a`` suffix) are used for SM 90+. + * On CUDA 12.9+, family targets (``-f`` suffix) are used for SM 100+. 
+ + **Output Variables (set in parent scope):** + + ``CMAKE_CUDA_ARCHITECTURES`` + Normalized list with appropriate suffixes (e.g., ``80-real``, ``90a-real``, + ``100f-real``). + + ``CMAKE_CUDA_ARCHITECTURES_ORIG`` + Original list of enabled architectures without suffixes. + + ``CMAKE_CUDA_ARCHITECTURES_FAMILIES`` + List of family architectures (e.g., ``100f``, ``120f``). + + ``CMAKE_CUDA_ARCHITECTURES_HAS_FAMILIES`` + Boolean indicating if family targets are supported. + + ``CMAKE_CUDA_MIN_ARCHITECTURE_HAS_ACCEL`` + Minimum architecture supporting accelerated (``-a``) suffix. + + ``CMAKE_CUDA_MIN_ARCHITECTURE_HAS_FAMILY`` + Minimum architecture supporting family (``-f``) suffix. + +.. command:: add_cuda_architectures + + Appends CUDA architectures to an existing target:: + + add_cuda_architectures(<target> [<arch> ...]) + + Adds the specified architectures to ``<target>``'s ``CUDA_ARCHITECTURES`` + property. The ``-a`` suffix is automatically added for supported + architectures. Architectures are only added if they were explicitly + requested by the user in ``CMAKE_CUDA_ARCHITECTURES_ORIG``. + +.. command:: set_cuda_architectures + + Sets CUDA architectures for a target:: + + set_cuda_architectures(<target> [<arch> ...]) + + Replaces the ``CUDA_ARCHITECTURES`` property of ``<target>`` with the + specified architectures. + + **Architecture Specification:** + + * Architectures may include the ``f`` suffix for family-conditional + compilation (e.g., ``100f``). + * Non-family architectures are only added if explicitly requested. + * Family architectures are only added if requested architectures would + enable compilation for that family. + + If no architectures are enabled for the target, it compiles with + ``PLACEHOLDER_KERNELS`` macro defined. The kernel source shall compile + with any architecture if ``PLACEHOLDER_KERNELS`` macro is defined. + +..
command:: filter_source_cuda_architectures + + Filters source files based on enabled CUDA architectures:: + + filter_source_cuda_architectures( + SOURCE_LIST <var> + TARGET <target> + ARCHS <arch> [<arch> ...] + [IMPLICIT_FAMILY] + ) + + Removes source files targeting disabled CUDA architectures from the + source list. Files are matched by patterns like ``sm80``, ``sm_80``, + ``SM80``, etc. in their filenames (for ``.cu`` and ``cubin.cpp`` files). + + ``SOURCE_LIST <var>`` + Name of the variable containing the list of source files. + Modified in place to remove filtered files. + + ``TARGET <target>`` + Target to add compile definitions to. If the target does not exist, + an INTERFACE library will be created. + + ``ARCHS <arch> [<arch> ...]`` + List of architectures to check. May include ``f`` suffix. + + ``IMPLICIT_FAMILY`` + When set, treats architectures >= ``CMAKE_CUDA_MIN_ARCHITECTURE_HAS_FAMILY`` + as implicitly family-enabled. + + **Defined Macros:** + + For each filtered architecture, a compile definition ``EXCLUDE_SM_<ARCH>`` + (or ``EXCLUDE_SM_<ARCH>F`` for family architectures) is added to ``<target>``. + +Example +^^^^^^^ + +.. code-block:: cmake + + include(cuda_configuration) + + # Setup compiler and detect version + setup_cuda_compiler() + + # enable_language, or project(project_name LANGUAGES CUDA) + # must be called after setup_cuda_compiler() and before + # setup_cuda_architectures() + enable_language(CUDA) + + # Configure architectures (uses CMAKE_CUDA_ARCHITECTURES if set) + setup_cuda_architectures() + + # Add additional architecture to compile for, if it is beneficial. + # e.g. Utilizing native FP8 support available in sm89 (Ada) + # but not in sm86 (Ampere) + # Note: The kernel source must still compile for all the architectures, + # by using a less performant implementation. + add_library(my_kernels_fp8 STATIC kernels.cu) + add_cuda_architectures(my_kernels_fp8 89) + + # Set specific architecture this source should compile for. + # e.g.
Kernels using WGMMA instructions + # Note: The kernel source must still compile for other architectures when + # ``PLACEHOLDER_KERNELS`` macro is defined. + add_library(my_kernels_sm90_only STATIC kernels.cu) + set_cuda_architectures(my_kernels_sm90_only 90) + + # Filter sources for disabled architectures + set(KERNEL_SOURCES + kernel_sm80.cubin.cpp + kernel_sm90.cubin.cpp + kernel_sm100.cubin.cpp + ) + filter_source_cuda_architectures( + SOURCE_LIST KERNEL_SOURCES + TARGET my_kernel_interface + ARCHS 80 90 100 + ) + # ``my_kernel_interface`` target is created with definitions to exclude + # disabled architectures. + +#]=======================================================================] + +#[[ +Determine CUDA version before enabling the language extension +check_language(CUDA) clears CMAKE_CUDA_HOST_COMPILER if CMAKE_CUDA_COMPILER +is not set +#]] macro(setup_cuda_compiler) - # Determine CUDA version before enabling the language extension - # check_language(CUDA) clears CMAKE_CUDA_HOST_COMPILER if CMAKE_CUDA_COMPILER - # is not set include(CheckLanguage) if(NOT CMAKE_CUDA_COMPILER AND CMAKE_CUDA_HOST_COMPILER) set(CMAKE_CUDA_HOST_COMPILER_BACKUP ${CMAKE_CUDA_HOST_COMPILER}) @@ -70,25 +267,28 @@ macro(setup_cuda_compiler) endif() endmacro() -function(setup_cuda_architectures) - # cmake-format: off - # Initialize and normalize CMAKE_CUDA_ARCHITECTURES. - # Special values: - # * `native` is resolved to HIGHEST available architecture. - # * Fallback to `all` if detection failed. - # * `all`/unset is resolved to a set of architectures we optimized for and compiler supports. - # * `all-major` is unsupported. - # Numerical architectures: - # * PTX is never included in result binary. - # * `*-virtual` architectures are therefore rejected. - # * `-real` suffix is automatically added to exclude PTX. - # * Always use accelerated (`-a` suffix) target for supported architectures.
- # * On CUDA 12.9 or newer, family (`-f` suffix) target will be used for supported architectures to reduce number of - # targets to compile for. - # * Extra architectures can be requested via add_cuda_architectures - # for kernels that benefit from arch specific features. - # cmake-format: on +#[[ +Initialize and normalize CMAKE_CUDA_ARCHITECTURES. +Special values: + +* `native` is resolved to HIGHEST available architecture. + * Fallback to `all` if detection failed. +* `all`/unset is resolved to a set of architectures we optimized for and compiler supports. +* `all-major` is unsupported. + +Numerical architectures: + +* PTX is never included in result binary. + * `*-virtual` architectures are therefore rejected. + * `-real` suffix is automatically added to exclude PTX. +* Always use accelerated (`-a` suffix) target for supported architectures. +* On CUDA 12.9 or newer, family (`-f` suffix) target will be used for supported architectures to reduce number of + targets to compile for. + * Extra architectures can be requested via add_cuda_architectures + for kernels that benefit from arch specific features. 
+#]] +function(setup_cuda_architectures) set(CMAKE_CUDA_ARCHITECTURES_RAW ${CMAKE_CUDA_ARCHITECTURES}) if(CMAKE_CUDA_ARCHITECTURES_RAW STREQUAL "native") # Detect highest available compute capability @@ -138,9 +338,6 @@ function(setup_cuda_architectures) message(FATAL_ERROR "Unrecognized CUDA architecture: ${CUDA_ARCH}") endif() endforeach() - if("103" IN_LIST CMAKE_CUDA_ARCHITECTURES_CLEAN) - list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN "100") - endif() list(REMOVE_DUPLICATES CMAKE_CUDA_ARCHITECTURES_CLEAN) set(CMAKE_CUDA_ARCHITECTURES_RAW ${CMAKE_CUDA_ARCHITECTURES_CLEAN}) endif() @@ -182,22 +379,29 @@ function(setup_cuda_architectures) endforeach() # -a suffix supported from Hopper (90) - set(MIN_ARCHITECTURE_HAS_ACCEL 90) + set(CMAKE_CUDA_MIN_ARCHITECTURE_HAS_ACCEL 90) + set(CMAKE_CUDA_MIN_ARCHITECTURE_HAS_ACCEL + ${CMAKE_CUDA_MIN_ARCHITECTURE_HAS_ACCEL} + PARENT_SCOPE) # -f suffix supported from Blackwell (100) starting from CUDA 12.9. if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.9") - set(MIN_ARCHITECTURE_HAS_FAMILY 100) + set(CMAKE_CUDA_MIN_ARCHITECTURE_HAS_FAMILY 100) set(CMAKE_CUDA_ARCHITECTURES_HAS_FAMILIES ON PARENT_SCOPE) else() # -a provides no cross architecture compatibility, but luckily until CUDA # 12.8 We have only one architecture within each family >= 9. - set(MIN_ARCHITECTURE_HAS_FAMILY 9999) # Effectively exclude all - # architectures + set(CMAKE_CUDA_MIN_ARCHITECTURE_HAS_FAMILY 9999) # Effectively exclude all + # architectures set(CMAKE_CUDA_ARCHITECTURES_HAS_FAMILIES OFF PARENT_SCOPE) endif() + set(CMAKE_CUDA_MIN_ARCHITECTURE_HAS_FAMILY + ${CMAKE_CUDA_MIN_ARCHITECTURE_HAS_FAMILY} + PARENT_SCOPE) + # Compatibility low bounds: Always compile kernels for these architectures. 86 # is enabled to avoid perf regression when using 80 kernels. 
set(ARCHITECTURES_COMPATIBILITY_BASE 80 86 90 100 120) @@ -252,11 +456,11 @@ function(setup_cuda_architectures) set(CMAKE_CUDA_ARCHITECTURES_NORMALIZED) set(CMAKE_CUDA_ARCHITECTURES_FAMILIES) foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES_NORMALIZED_LIST) - if(CUDA_ARCH GREATER_EQUAL ${MIN_ARCHITECTURE_HAS_FAMILY} + if(CUDA_ARCH GREATER_EQUAL ${CMAKE_CUDA_MIN_ARCHITECTURE_HAS_FAMILY} AND NOT CUDA_ARCH IN_LIST ARCHITECTURES_NO_COMPATIBILITY) list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}f-real") list(APPEND CMAKE_CUDA_ARCHITECTURES_FAMILIES "${CUDA_ARCH}f") - elseif(CUDA_ARCH GREATER_EQUAL ${MIN_ARCHITECTURE_HAS_ACCEL}) + elseif(CUDA_ARCH GREATER_EQUAL ${CMAKE_CUDA_MIN_ARCHITECTURE_HAS_ACCEL}) list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}a-real") else() list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}-real") @@ -271,17 +475,15 @@ function(setup_cuda_architectures) PARENT_SCOPE) endfunction() +#[[ +Add CUDA architectures to target. +-a suffix is added automatically for supported architectures. +Architectures are added only if user explicitly requested support for that architecture. +#]] function(add_cuda_architectures target) - # cmake-format: off - # Add CUDA architectures to target. - # -a suffix is added automatically for supported architectures. - # Architectures are added only if user explicitly requested support for that architecture. - # cmake-format: on - set(MIN_ARCHITECTURE_HAS_ACCEL 90) - foreach(CUDA_ARCH IN LISTS ARGN) if(${CUDA_ARCH} IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG) - if(${CUDA_ARCH} GREATER_EQUAL ${MIN_ARCHITECTURE_HAS_ACCEL}) + if(${CUDA_ARCH} GREATER_EQUAL ${CMAKE_CUDA_MIN_ARCHITECTURE_HAS_ACCEL}) set(REAL_CUDA_ARCH "${CUDA_ARCH}a-real") else() set(REAL_CUDA_ARCH "${CUDA_ARCH}-real") @@ -294,18 +496,19 @@ function(add_cuda_architectures target) endforeach() endfunction() -function(set_cuda_architectures target) - # cmake-format: off - # Set CUDA architectures for a target. 
- # -a suffix is added automatically for supported architectures. - # Architectures passed in may be specified with -f suffix to build family conditional version of the kernel. - # Non-family architectures are added only if user explicitly requested support for that architecture. - # Family conditional architectures are only added if user requested architectures would enable compilation for it. - # If user requested no architectures set on the target, - # the target will be compiled with `PLACEHOLDER_KERNELS` macro defined. - # cmake-format: on - set(MIN_ARCHITECTURE_HAS_ACCEL 90) +#[[ +Set CUDA architectures for a target. +-a suffix is added automatically for supported architectures. +Architectures passed in may be specified with -f suffix to build family conditional version of the kernel. + +Non-family architectures are added only if user explicitly requested support for that architecture. +Family conditional architectures are only added if user requested architectures would enable compilation for it. + +If user requested no architectures set on the target, +the target will be compiled with `PLACEHOLDER_KERNELS` macro defined. +#]] +function(set_cuda_architectures target) set(CUDA_ARCHITECTURES "") foreach(CUDA_ARCH IN LISTS ARGN) if(${CUDA_ARCH} MATCHES "[0-9]+f") @@ -326,7 +529,7 @@ function(set_cuda_architectures target) endforeach() endif() elseif(${CUDA_ARCH} IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG) - if(${CUDA_ARCH} GREATER_EQUAL ${MIN_ARCHITECTURE_HAS_ACCEL}) + if(${CUDA_ARCH} GREATER_EQUAL ${CMAKE_CUDA_MIN_ARCHITECTURE_HAS_ACCEL}) list(APPEND CUDA_ARCHITECTURES "${CUDA_ARCH}a-real") else() list(APPEND CUDA_ARCHITECTURES "${CUDA_ARCH}-real") @@ -342,3 +545,153 @@ function(set_cuda_architectures target) ${CUDA_ARCHITECTURES}) endif() endfunction() + +#[[ +Filter out source files targeting CUDA architectures not enabled. 
+ +Arguments: + SOURCE_LIST - Name of the variable containing the list of source files to filter + TARGET - Target to add compile definitions to. If the target does not exist, + an INTERFACE library will be created. + ARCHS - List of architectures to check and potentially filter + IMPLICIT_FAMILY - Optional flag to enable implicit family mode + +For each ARCH passed in: + +- if IMPLICIT_FAMILY is not set: + - if ARCH is not suffixed by f: + if ARCH is not in CMAKE_CUDA_ARCHITECTURES_ORIG, source files containing "sm${ARCH}" + but not "sm${ARCH}f" (case insensitive) will be excluded + Macro "EXCLUDE_SM_${ARCH}" will be defined on TARGET + - if ARCH is suffixed by f, NARCH is ARCH without f suffix: + if ARCH is not in CMAKE_CUDA_ARCHITECTURES_FAMILIES, source files containing + "sm${NARCH}f" (case insensitive) will be excluded + Macro "EXCLUDE_SM_${NARCH}F" will be defined on TARGET + +- if IMPLICIT_FAMILY is set: + ARCH shall not be suffixed by f. + - if ARCH >= CMAKE_CUDA_MIN_ARCHITECTURE_HAS_FAMILY: + if "${ARCH}f" is not in CMAKE_CUDA_ARCHITECTURES_FAMILIES, + source files containing "sm${ARCH}" but not "sm${ARCH}a" (case insensitive) will be excluded + Macro "EXCLUDE_SM_${ARCH}" (no F) will be defined on TARGET + - else: + if "${ARCH}" is not in CMAKE_CUDA_ARCHITECTURES_ORIG, + source files containing "sm${ARCH}" (case insensitive) will be excluded + Macro "EXCLUDE_SM_${ARCH}" will be defined on TARGET +#]] +function(filter_source_cuda_architectures) + set(options IMPLICIT_FAMILY) + set(oneValueArgs SOURCE_LIST TARGET) + set(multiValueArgs ARCHS) + + cmake_parse_arguments(PARSE_ARGV 0 arg "${options}" "${oneValueArgs}" + "${multiValueArgs}") + set(SOURCES "${${arg_SOURCE_LIST}}") + + if(NOT TARGET ${arg_TARGET}) + add_library(${arg_TARGET} INTERFACE) + endif() + + # Determine if target is INTERFACE library to use correct visibility + get_target_property(_target_type ${arg_TARGET} TYPE) + if(_target_type STREQUAL "INTERFACE_LIBRARY") + set(_compile_def_visibility
INTERFACE) + else() + set(_compile_def_visibility PUBLIC) + endif() + + foreach(ARCH IN LISTS arg_ARCHS) + set(SHOULD_FILTER FALSE) + set(MATCH_PATTERN "") + set(EXCLUDE_PATTERN "") + set(ARCH_FOR_DEFINE "") + + if(NOT arg_IMPLICIT_FAMILY) + # Check if ARCH ends with 'f' + string(REGEX MATCH "^(.+)f$" _has_f_suffix "${ARCH}") + + if(_has_f_suffix) + # ARCH is suffixed by 'f' (e.g., "100f") + set(BASE_ARCH "${CMAKE_MATCH_1}") + if(NOT "${ARCH}" IN_LIST CMAKE_CUDA_ARCHITECTURES_FAMILIES) + set(SHOULD_FILTER TRUE) + set(ARCH_FOR_DEFINE "${BASE_ARCH}F") + # Match "sm${BASE_ARCH}f" - straightforward match, no exclusion + # pattern needed + set(MATCH_PATTERN ".*[Ss][Mm]_?${BASE_ARCH}f.*(cubin\.cpp|\.cu)$") + endif() + else() + # ARCH is NOT suffixed by 'f' (e.g., "80") + if(NOT "${ARCH}" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG) + set(SHOULD_FILTER TRUE) + set(ARCH_FOR_DEFINE "${ARCH}") + # Match "sm${ARCH}" but NOT "sm${ARCH}f" + set(MATCH_PATTERN ".*[Ss][Mm]_?${ARCH}.*(cubin\.cpp|\.cu)$") + set(EXCLUDE_PATTERN ".*[Ss][Mm]_?${ARCH}f.*(cubin\.cpp|\.cu)$") + endif() + endif() + else() + # IMPLICIT_FAMILY is set - ARCH shall not be suffixed by 'f' + if(${ARCH} GREATER_EQUAL ${CMAKE_CUDA_MIN_ARCHITECTURE_HAS_FAMILY}) + # ARCH >= CMAKE_CUDA_MIN_ARCHITECTURE_HAS_FAMILY + if(NOT "${ARCH}f" IN_LIST CMAKE_CUDA_ARCHITECTURES_FAMILIES) + set(SHOULD_FILTER TRUE) + set(ARCH_FOR_DEFINE "${ARCH}") + # Match "sm${ARCH}" but NOT "sm${ARCH}a" + set(MATCH_PATTERN ".*[Ss][Mm]_?${ARCH}.*(cubin\.cpp|\.cu)$") + set(EXCLUDE_PATTERN ".*[Ss][Mm]_?${ARCH}a.*(cubin\.cpp|\.cu)$") + endif() + else() + # ARCH < CMAKE_CUDA_MIN_ARCHITECTURE_HAS_FAMILY + if(NOT "${ARCH}" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG) + set(SHOULD_FILTER TRUE) + set(ARCH_FOR_DEFINE "${ARCH}") + # Match "sm${ARCH}" - no exclusion pattern needed + set(MATCH_PATTERN ".*[Ss][Mm]_?${ARCH}.*(cubin\.cpp|\.cu)$") + endif() + endif() + endif() + + if(SHOULD_FILTER) + # Get files matching the main pattern + set(SOURCES_TO_CHECK "${SOURCES}") 
+ list(FILTER SOURCES_TO_CHECK INCLUDE REGEX "${MATCH_PATTERN}") + + if(NOT "${EXCLUDE_PATTERN}" STREQUAL "") + # Find files matching the exclusion pattern (these should be kept) + set(SOURCES_TO_KEEP "${SOURCES_TO_CHECK}") + list(FILTER SOURCES_TO_KEEP INCLUDE REGEX "${EXCLUDE_PATTERN}") + # Remove the files we want to keep from the check list + if(SOURCES_TO_KEEP) + list(REMOVE_ITEM SOURCES_TO_CHECK ${SOURCES_TO_KEEP}) + endif() + endif() + + set(SOURCES_FILTERED "${SOURCES_TO_CHECK}") + + list(LENGTH SOURCES_FILTERED SOURCES_FILTERED_LEN) + message( + STATUS + "Excluding ${SOURCES_FILTERED_LEN} cubins for SM ${ARCH} from ${CMAKE_CURRENT_SOURCE_DIR}" + ) + foreach(filtered_item IN LISTS SOURCES_FILTERED) + message(VERBOSE "- ${filtered_item}") + endforeach() + + # Remove filtered files from sources + if(SOURCES_FILTERED) + list(REMOVE_ITEM SOURCES ${SOURCES_FILTERED}) + endif() + + # Add compile definition to target + target_compile_definitions( + ${arg_TARGET} + ${_compile_def_visibility} + "EXCLUDE_SM_${ARCH_FOR_DEFINE}") + endif() + endforeach() + + set(${arg_SOURCE_LIST} + "${SOURCES}" + PARENT_SCOPE) +endfunction() diff --git a/cpp/tensorrt_llm/common/CMakeLists.txt b/cpp/tensorrt_llm/common/CMakeLists.txt index 7243c6a5d2..73de3d6113 100644 --- a/cpp/tensorrt_llm/common/CMakeLists.txt +++ b/cpp/tensorrt_llm/common/CMakeLists.txt @@ -36,6 +36,7 @@ add_library(common_src OBJECT ${SRCS} ${CU_SRCS}) add_cuda_architectures(common_src 89) set_property(TARGET common_src PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET common_src PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(common_src PUBLIC trtllm_gen_fmha_interface) if(ENABLE_CUBLASLT_FP4_GEMM) target_compile_definitions(common_src PRIVATE ENABLE_CUBLASLT_FP4_GEMM) diff --git a/cpp/tensorrt_llm/kernels/CMakeLists.txt b/cpp/tensorrt_llm/kernels/CMakeLists.txt index f709496b5b..541de36515 100644 --- a/cpp/tensorrt_llm/kernels/CMakeLists.txt +++ b/cpp/tensorrt_llm/kernels/CMakeLists.txt 
@@ -15,6 +15,20 @@ # the License. # +add_subdirectory(cutlass_kernels) +add_subdirectory(cuteDslKernels) +add_subdirectory(flashMLA) +add_subdirectory(contextFusedMultiHeadAttention) +add_subdirectory(decoderMaskedMultiheadAttention) +add_subdirectory(selectiveScan) +add_subdirectory(userbuffers) +add_subdirectory(trtllmGenKernels) +add_subdirectory(fusedLayernormKernels) +add_subdirectory(groupRmsNormKernels) +add_subdirectory(llama4MinLatencyKernels) +add_subdirectory(dsv3MinLatencyKernels) +add_subdirectory(causalConv1d) + file(GLOB_RECURSE SRC_CPP *.cpp) file(GLOB_RECURSE SRC_CU *.cu) @@ -38,28 +52,6 @@ list(FILTER SRC_CPP EXCLUDE REGEX "userbuffers/.*") list(FILTER SRC_CU EXCLUDE REGEX "userbuffers/.*") list(FILTER SRC_CU EXCLUDE REGEX "fusedLayernormKernels/.*") -function(filter_cuda_archs ARCH SOURCES_VAR) - if(NOT "${ARCH}" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG) - set(FILTER_REGEX ".*[Ss][Mm]_?${ARCH}(af)?.*(cubin\.cpp|\.cu)$") - list(APPEND SOURCES ${${SOURCES_VAR}}) - list(APPEND SOURCES_FILTERED ${SOURCES}) - list(FILTER SOURCES_FILTERED INCLUDE REGEX "${FILTER_REGEX}") - list(LENGTH SOURCES_FILTERED SOURCES_FILTERED_LEN) - message( - STATUS - "Excluding ${SOURCES_FILTERED_LEN} cubins for SM ${ARCH} from ${CMAKE_CURRENT_SOURCE_DIR}" - ) - foreach(filtered_item ${SOURCES_FILTERED}) - message(VERBOSE "- ${filtered_item}") - endforeach() - list(FILTER SOURCES EXCLUDE REGEX "${FILTER_REGEX}") - set(${SOURCES_VAR} - "${SOURCES}" - PARENT_SCOPE) - add_compile_definitions("EXCLUDE_SM_${ARCH}") - endif() -endfunction() - if(NOT ENABLE_MULTI_DEVICE) list(FILTER SRC_CU EXCLUDE REGEX "customAllReduceKernels*.*cu$") endif() @@ -72,18 +64,5 @@ target_include_directories( PUBLIC $ ) +target_link_libraries(kernels_src PUBLIC trtllm_gen_fmha_interface) add_cuda_architectures(kernels_src 89) - -add_subdirectory(cutlass_kernels) -add_subdirectory(cuteDslKernels) -add_subdirectory(flashMLA) -add_subdirectory(contextFusedMultiHeadAttention) 
-add_subdirectory(decoderMaskedMultiheadAttention) -add_subdirectory(selectiveScan) -add_subdirectory(userbuffers) -add_subdirectory(trtllmGenKernels) -add_subdirectory(fusedLayernormKernels) -add_subdirectory(groupRmsNormKernels) -add_subdirectory(llama4MinLatencyKernels) -add_subdirectory(dsv3MinLatencyKernels) -add_subdirectory(causalConv1d) diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/CMakeLists.txt b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/CMakeLists.txt index 7c96b36bdf..ccc48ff2ac 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/CMakeLists.txt +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/CMakeLists.txt @@ -19,42 +19,56 @@ file(GLOB_RECURSE SRC_CPP *.cpp) file(GLOB_RECURSE SRC_CU *.cu) list(FILTER SRC_CU EXCLUDE REGEX "fmha_v2_cu/.*") -filter_cuda_archs("80" SRC_CPP) -filter_cuda_archs("86" SRC_CPP) -filter_cuda_archs("89" SRC_CPP) -filter_cuda_archs("90" SRC_CPP) -filter_cuda_archs("100" SRC_CPP) -filter_cuda_archs("120" SRC_CPP) +add_library(context_attention_src OBJECT) -add_library(context_attention_src OBJECT ${SRC_CPP} ${SRC_CU}) +filter_source_cuda_architectures( + SOURCE_LIST SRC_CPP + ARCHS 80 86 89 90 100 120 + TARGET context_attention_src + IMPLICIT_FAMILY) + +target_sources(context_attention_src PRIVATE ${SRC_CPP} ${SRC_CU}) target_compile_definitions(context_attention_src PRIVATE USE_DEMO_BERT_PARAMS=1 GENERATE_CUBIN=1) set_target_properties( context_attention_src PROPERTIES POSITION_INDEPENDENT_CODE ON CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(context_attention_src PUBLIC trtllm_gen_fmha_interface) foreach(arch IN ITEMS 80 86 89 90 100 120) - if("${arch}" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG) - file(GLOB arch_files "fmha_v2_cu/*_sm${arch}.cu") - if(arch_files) - set(TARGET_NAME _context_attention_kernels_${arch}) - add_library(${TARGET_NAME} OBJECT ${arch_files}) - target_compile_definitions(${TARGET_NAME} PRIVATE USE_DEMO_BERT_PARAMS=1 - GENERATE_CUBIN=1) 
- set_target_properties( - ${TARGET_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON - CUDA_RESOLVE_DEVICE_SYMBOLS ON) - target_include_directories( - ${TARGET_NAME} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../../kernels/fmha_v2/src/ - ${CMAKE_CURRENT_SOURCE_DIR}/../../../kernels/fmha_v2/generated/) - if(${arch} GREATER_EQUAL 100) - set_cuda_architectures(${TARGET_NAME} "${arch}f") - else() - set_cuda_architectures(${TARGET_NAME} ${arch}) - endif() - target_sources(context_attention_src - PUBLIC $) + set(ARCH_ENABLED FALSE) + set(TARGET_ARCH "") + if(${arch} GREATER_EQUAL ${CMAKE_CUDA_MIN_ARCHITECTURE_HAS_FAMILY}) + if("${arch}f" IN_LIST CMAKE_CUDA_ARCHITECTURES_FAMILIES) + set(ARCH_ENABLED TRUE) + set(TARGET_ARCH "${arch}f") + endif() + else() + if("${arch}" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG) + set(ARCH_ENABLED TRUE) + set(TARGET_ARCH "${arch}") endif() endif() + + if(NOT ${ARCH_ENABLED}) + continue() + endif() + + file(GLOB arch_files "fmha_v2_cu/*_sm${arch}.cu") + if(arch_files) + set(TARGET_NAME _context_attention_kernels_${arch}) + add_library(${TARGET_NAME} OBJECT ${arch_files}) + target_compile_definitions(${TARGET_NAME} PRIVATE USE_DEMO_BERT_PARAMS=1 + GENERATE_CUBIN=1) + set_target_properties( + ${TARGET_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON) + target_include_directories( + ${TARGET_NAME} + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../../kernels/fmha_v2/src/ + ${CMAKE_CURRENT_SOURCE_DIR}/../../../kernels/fmha_v2/generated/) + set_cuda_architectures(${TARGET_NAME} ${TARGET_ARCH}) + target_sources(context_attention_src + PUBLIC $) + endif() endforeach() diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/CMakeLists.txt b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/CMakeLists.txt index a1c1178f10..b87e5b19af 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/CMakeLists.txt +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/CMakeLists.txt @@ -27,11 +27,12 @@ 
set(SRC_CU_EXTRA) list(FILTER SRC_CPP EXCLUDE REGEX ".*nvrtcWrapper/.*") add_subdirectory(decoderXQAImplJIT/nvrtcWrapper) -filter_cuda_archs("80" SRC_CPP) -filter_cuda_archs("86" SRC_CPP) -filter_cuda_archs("89" SRC_CPP) -filter_cuda_archs("90" SRC_CPP) -filter_cuda_archs("120" SRC_CPP) +add_library(decoder_attention_src OBJECT) +filter_source_cuda_architectures( + SOURCE_LIST SRC_CPP + ARCHS 80 86 89 90 120 + TARGET decoder_attention_src + IMPLICIT_FAMILY) set(basic_heads 32 64 128) foreach(HEAD ${basic_heads}) @@ -64,7 +65,7 @@ foreach(HEAD ${extra_heads}) endforeach() if(NOT WIN32) - add_library(decoder_attention_src OBJECT ${SRC_CPP}) + target_sources(decoder_attention_src PRIVATE ${SRC_CPP}) # Split some sources to shared library for Linux add_library(${DECODER_SHARED_TARGET_0} SHARED ${SRC_CU}) @@ -78,10 +79,12 @@ if(NOT WIN32) set_property(TARGET ${DECODER_SHARED_TARGET_1} PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) else() - add_library(decoder_attention_src OBJECT ${SRC_CPP} ${SRC_CU} ${SRC_CU_EXTRA}) + target_sources(decoder_attention_src PRIVATE ${SRC_CPP} ${SRC_CU} + ${SRC_CU_EXTRA}) endif() -target_link_libraries(decoder_attention_src PUBLIC nvrtc_wrapper_src) +target_link_libraries(decoder_attention_src PUBLIC nvrtc_wrapper_src + trtllm_gen_fmha_interface) set_property(TARGET decoder_attention_src PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET decoder_attention_src PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/CMakeLists.txt b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/CMakeLists.txt index f50775a56f..b007f1fd7d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/CMakeLists.txt +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/CMakeLists.txt @@ -18,9 +18,13 @@ file(GLOB_RECURSE SRC_CPP *.cpp) file(GLOB_RECURSE SRC_CU *.cu) -filter_cuda_archs("100" SRC_CPP) +add_library(trtllm_gen_batched_gemm OBJECT) +filter_source_cuda_architectures( + 
SOURCE_LIST SRC_CPP + ARCHS 100 103 100f + TARGET trtllm_gen_batched_gemm) -add_library(trtllm_gen_batched_gemm OBJECT ${SRC_CPP} ${SRC_CU}) +target_sources(trtllm_gen_batched_gemm PRIVATE ${SRC_CPP} ${SRC_CU}) target_compile_definitions(trtllm_gen_batched_gemm PUBLIC TLLM_GEN_EXPORT_INTERFACE TLLM_ENABLE_CUDA) diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h index 02c34caf0f..dcc25c32bb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h @@ -670,7 +670,8 @@ public: size_t getNumBatchedGemmConfigs() const { #ifdef TLLM_GEN_EXPORT_INTERFACE - return tensorrt_llm::kernels::tllmGenBatchedGemmListLen; + return sizeof(tensorrt_llm::kernels::tllmGenBatchedGemmList) + / sizeof(tensorrt_llm::kernels::tllmGenBatchedGemmList[0]); #else return 0; #endif diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h index c4add0a406..6fee9be084 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h @@ -31,9 +31,7 @@ namespace kernels #define TLLM_GEN_COMMIT "0813b449" #define TLLM_GEN_EXPORT_VERSION "7.0.4.0.4.0" -static constexpr size_t tllmGenBatchedGemmListLen = 449; - -#ifndef EXCLUDE_SM_100 +#ifndef EXCLUDE_SM_100F extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin[]; extern unsigned char 
Bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin[]; extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin[]; @@ -68,46 +66,6 @@ extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_ extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin[]; extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin[]; extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; -extern unsigned char 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; -extern unsigned char 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; -extern unsigned char 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; -extern unsigned char 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin[]; extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin[]; extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin[]; @@ -331,46 +289,6 @@ extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cg extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin[]; extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin[]; extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; -extern 
unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; -extern 
unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned 
char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin[]; extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin[]; extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin[]; @@ -483,9 +401,95 @@ extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8 extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin[]; extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin[]; extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin[]; -#endif // EXCLUDE_SM_100 +#endif // EXCLUDE_SM_100F #ifndef EXCLUDE_SM_100 +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; +extern unsigned char 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; +extern unsigned char 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; 
+extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin[]; +#endif // EXCLUDE_SM_100 + +#ifndef EXCLUDE_SM_103 +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; +extern unsigned char 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; +extern unsigned char 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; +extern unsigned char 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; 
+extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin[]; +#endif // EXCLUDE_SM_103 + +#ifndef EXCLUDE_SM_100F extern unsigned int 
Bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len; extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len; extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len; @@ -520,46 +524,6 @@ extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_c extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin_len; extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len; extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; -extern unsigned int 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; -extern unsigned int 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; -extern unsigned int 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; -extern unsigned int 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len; extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len; extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len; @@ -783,46 +747,6 @@ extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len; extern unsigned int 
Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len; extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; -extern unsigned int 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; 
-extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; 
-extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin_len; extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin_len; extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin_len; @@ -935,11 +859,97 @@ extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len; extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len; extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len; +#endif // EXCLUDE_SM_100F + +#ifndef EXCLUDE_SM_100 +extern unsigned int 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; +extern unsigned int 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len; +extern unsigned int 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned 
int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len; #endif // EXCLUDE_SM_100 +#ifndef EXCLUDE_SM_103 +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; +extern unsigned int 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; +extern unsigned int 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; +extern 
unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; 
+extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len; +#endif // EXCLUDE_SM_103 + static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { -#ifndef EXCLUDE_SM_100 +#ifndef EXCLUDE_SM_100F {Bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 211624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "bfbeed7c28f901d8813ea233eed1974f8178aebd01cd5e95763a96770c24815d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 @@ -4374,4046 +4384,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 116224, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "b80363a859f6d8f5a769e8877feb9e5c3c9945d9a0cb8593b17aad3a04dcab33", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* 
mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 
-, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 116224, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "a0f5b6b5851c8e8f8b984fb06f752251f705bafbcc1b67bfafcc6415c1d39aef", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ 
trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* 
mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "4ba1cd4f665f51c7d916a85815e0c70adf4b3a40fd9491758fcfeed6c41330fd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ 
trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* 
mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "8e4a92864264e883022cac435b7c3f54868a8f227d295fa1e0543110ce407a0d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* 
mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ 
gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 116224, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "58454e6a9381c680e0b7608d28c362d8f63bc318fd0908c3600c1d0e62f1a200", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* 
mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ 
batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 116224, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "2c53fdfd3bdf91cf8913e2ad48af3b32cb65283997b3bbebb9c4d2113de2e8c9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* 
mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* 
mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "6711a7cb78b94798d476e2015527f78a272e12c28601584a656989b57ec5462f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, 
/* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* 
mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "dc5f40da99b04a75c8647fad5390bc4adad1dbfccb59ef1bbc533950f951f3f4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM 
*/ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} 
-, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 140800, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "182231b859090838eae7617076cbddadd72b1c7dfcf6d05b8e693645d37be952", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* 
mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 140800, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "d1b5c844e6c38acd2bad7c412c2f0e1319f9b76c526c4af902d2d339a3181c37", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, 
/* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "37967b27f7d3079b25775e07ad93ff48ee743e530e1e1f30daade4bc79702c29", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* 
mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "86abb38d82287b384bc30b68834085cd1b76b2f428b612288608d5decaccd406", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* 
mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 140800, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "fd894052ddd3be8836eb3f9f5bf0e4501b594b169499681925c42521e7836faf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 
0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 140800, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "55425892a71ed9830bbf75113ae99e5afa4d34afecfa76a809794440cb89fd42", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 
0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "25cd93ef37a89766143088b1f3b6bbf02cb5a58b469fd60858a3442c2dc0746b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* 
mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "cc88fd486b2ac28a4a989b0840a212ecfe8b4ed2cdfa65f5e5ec0ed0404baaf6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* 
mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 157120, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "2bdaf648e20c21147961c0de26d36cad1b004459ba96d2277229d644b8d7b5a7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, 
/* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 157120, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "c1720c19a7714acfc0f9809ddccfb301dff4a8b0f19eab3666bf7e877e696899", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, 
/* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "4ff3724a4e7cd7b4050a847a825c0229b35fbb2af6efe37ae89aad1deb46278c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* 
mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "2133f1ca2f1e5fd5919bd5be40c35715b7e449b0086f863d54517e279508b7f8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* 
mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 157120, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "d606a071c11855c2c44f2ed00e67cbe0f31e98c8e49fcd4d66ec9475808c159b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 
0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 157120, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "a4d11a241c37cdfccd9b98763f1e617b93455fa9af2381f6652863fb339a8e5a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 
0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "c8cc6d888b1ec6fe1c442d9381c13548c5d20305f7eef33d890d7e8d3de0cc04", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* 
mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "0668fcaa93a3b9db892289b20c8e805319e64c43ff7331593a084ece4da0b334", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* 
mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 103936, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "23c7c050fbfb6c9add2266b6934269ab38c98aafb0f19d9a0d162e17032ee23a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* 
mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 103936, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "144dae2c3829b8126fe4fd649471ee4bba104ea4b7ae8bc3dd2dd0e00adc7bf5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* 
mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "f8b279c3be2bb4bfc46e7882c5d8f4e5fb9617d43f0c9e18423e57b2ca27e827", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, 
/* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "65252b2b596f6f377ad3b3a63cfb14034bef1d08e11928d9f89337a0ea123f8c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages 
*/ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 
103936, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "8d2c97077db4c7a6baa0a2f64cb3d2e8c8a92f13406bc48a3f7e25b89f947153", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* 
mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 103936, 
"bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "ece177649d7f32ffb8f04561eecfe9fc2a7801c27f87565b7625833b0f56f001", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors 
*/ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, 
"4e8caf0cb6dca05c84e297e98d8cf86e1ccc66a9ba098a9361b7de25bf0d96e1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ 
trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "accdd4b75830ecd977d8ca73de8467d78fc3df811f3a6ddf36ec1be366a6e6cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ 
gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* 
mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 123264, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "f0137d289298ec525dda077ebcdbee34c342c148ecdbcc97694ed741b8bc9850", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) 
-, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* 
mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 123264, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "8b3d24b392aed12fde95fc281a5e2604bdac528bc458806d61480c0b7a1e0a7d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) 
-, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* 
mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "0272db6937e0556c1bf94ec38d5f00c3a018527debda2f251335faaafdd006e3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ 
gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 
512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "c347838d3d85bdef1b279711f50321f580e82fcaba9e864d88caedeae8db043c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, 
/* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ 
batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 123264, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "e08f77deaa76a0580f6e7960c31dcfb9d29267a13c615b5d4dfc1383d5e02fe1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* 
mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 1024 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* 
mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 123264, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "488565c97efcbd5d511d629c2d7d5589e74e844eb81edc99c78a4525415120da", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* 
mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 1024 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* 
mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "e82554dc7bef6cb0fbc7c84b0cdb678e7e92f13a17b6de8b326322f69351b1f2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ 
gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 1024 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ 
{batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "56d309e264a5b17cf6462acbf59c601edbc56635f20f9c81b131faa6e744bc97", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, 
/* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 1024 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 206896, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "a4443a925bfc1cfda47bbc3a5a8c39461ab0fa55918931e60b57901097244932", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 @@ -16896,107 +12866,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(1) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 71040, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "fc979100a837ef569d02e1c6cb1ee777b379ea82e19d393cb9673d79182933b4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ 
trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ std::nullopt -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 
-, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, {Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 71040, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "fef659401e81017fb7502ec1f472fe8449a3541fe092d0565cbf0061e63fb899", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) @@ -22148,107 +18017,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(0) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, 
Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 192072, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "282f18924a3743702e6cbab5468e3edfdf8c08524781b8d66c32100a43a7425f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 2 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 256 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 168 -, /* mNumRegsPerThreadNonEpilogueWarp */ 96 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* 
mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ std::nullopt -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 0 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, {Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222976, 
"bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "6246b750671d1df893a6ffa76461b63e48d035ee2867bad6cd678f0335425893", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) @@ -30937,4046 +26705,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 114176, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "1fe2a4fcc63419741b974a98c075cb6fa8ac47bcfd0c4697273257d4740e011d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, 
/* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* 
mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 114176, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "3a61b39374109515e1ade49b4907be59e20bca49c5f480cd6b58bfad54cf1a17", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* 
mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* 
mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "6890b924207a3c763c0fc2f4233ae126bd10b519b03ee6562b677ae8b3290223", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits 
*/ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, 
/* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "58f8055300a8040f1b135c1ad6545e11a612ea599dae9d778cb15e2d956b2162", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ 
trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, 
gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 114176, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "49ab452b4736dde55c25e388b8a3fabff1eeebb0efd88e317c569f4288791d34", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 
-, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 114176, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "f78cab2e7d21ea48e850bc3bc10bc94c33b12e03858cca10ef7bc86cb5213ce6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 
4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "15baf303961640c00748cc04411d9dd5260c2eaf9103a1a2fe9df4625f24a491", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* 
mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "c771ffb7c2b3bf77e197e0f5839776491b793c86fceb2b212915efe3d0bf6d0d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* 
mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 136704, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "39ed1add34b6ae815b9e5144d652473503f4705e5e4e691f11de0a6168be09c1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, 
/* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 136704, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "b86092e42658328697f756e0b6299a3428fe9e909efa0d312e8917dd234fe7c3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, 
/* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "b173038fe82fcd45c74e0f543ffddc9b5a992e508427693c88dd603591346490", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps 
*/ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "ac078c0e70694abd7747eca1966a3b540bca6653638d751d277957ae8bdf16a1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps 
*/ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 136704, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "d9114595ca5160f3364256fce3de22eaec00ff8c8c91c132c81760d431845626", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 
4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 136704, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "9f0cdf7956ae297775db00157b8b12c2377a4ab03ad2b1413899367c45bf9b72", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 
4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "7ca8f71a71517ece6d869214045ce026d088f02b78000c4023aaf66b220e7072", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* 
mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "f0e809798f7d1e49e03251297864d15cb475b731ea1645f32473c4385f17d5e7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* 
mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 148928, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "95a47449875996cfb3a083a3fe679ce1383a36802bb098fc630a755dd8c1e271", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, 
/* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 148928, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "8798142071deb2b08e51bb9c48785d96535a4b8777a2dd45a27502d312eeb8d4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, 
/* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "84f35f988e4ce8b3648fa0f136317bf1e6b7e7b61df943096b8b72228792a122", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps 
*/ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "04cbae1f9840a6b2cc8370eee77ae07caed03f5e915cdfac3eba695656d9925e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps 
*/ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 148928, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "c7dba6c4daf51b0cc7ac03c3af2230944104706be390378b61ff5e96ed8ab93b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 
4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 148928, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "cdf8266c614cef4089fe7ecaa599b7c3f94a539945b0bf7c59d1ee59c5d164b7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 
4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "773cf8a7290cd697a380238ea6fe8b2cbf8495bf0e4836c0dade709d4b1be5d0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* 
mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "6dfdd8f20fdd53e462b29131837bb63657e24054151b244c5fdfaedf2de2e339", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* 
mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 102912, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "5e2396359abf0a28a42978c6b726f71c03c61bcbb3b6833555e76eec44e71a1d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* 
mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 102912, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "ad09703b01b1569440d32d8ff305f54c8d4a768ad4762457ef36519bc1e6c51c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* 
mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "8a9c4dac0b5dcb6862bd595b021c8090cb6843c315b578360336e703484d7135", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* 
mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "a999a6d584e92e81a65bc4bb11c7c3f6c029b5f006be0325b4bba6303136b512", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* 
mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 102912, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "bb6c1726223d7fcd282381ac83193b737a6ec3d8b478861c89d343ce6d17d7b8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* 
mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 102912, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "8c0a812f3e5f883ceb969c21f2f89b20133e0a91774fa108287e9ed1939edc9b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* 
mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "6cecc3fbf44f1b439daa5612e8dd8ddba93647443824554fecb21901e2a28460", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 
-, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "2e1e4f8bfd957321b3be798086873a82f12c7691350dcfd77f125b360a6a369d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 
-, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 122240, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "8b5951d2fdb9cc65e959d057a36b3541929a13e794357780e480513b06277b9e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* 
mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 122240, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "61a677a4b6bc423eed02992bcc263571e3fc75a83ef5b875fee35945ecf15a48", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* 
mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "2acc6b18c473b2f36881d5da27d6c2e4a47c16c3176678e99eb22f4605cda0f4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* 
mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "a3d1435396a8edaad263c7c90e09004fae95765f47cbae1e06b21184fb0e7a7f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* 
mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 122240, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "19d26ab5bdc47933cfb624c5a1a4af1ca14a24f5d6a9a535a2d012bb9b017116", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* 
mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 1024 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 122240, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "20fbcb015ec4f5b5d19387ce0a22929f4c6587f164af9e6bb575ab7591f40aa3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* 
mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 1024 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "37aa83626f8ce4bff0b844d908c585f0d5834775c4f0e8cf7b85f9b4a8c33c98", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 
0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 1024 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "c4eca7a82523b0a75b1abfc101f9cb6841de0707c3c010f99b6e8b67a78f6e96", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 
0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 1024 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm103a}, 
{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin_len, 190512, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "4a42f3b6dad47959663b79eb2417c1c39fef701979c98b042a2ea3b2227d380b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 @@ -46289,7 +38017,8295 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, +#endif // EXCLUDE_SM_100F + +#ifndef EXCLUDE_SM_100 +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 116224, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "b80363a859f6d8f5a769e8877feb9e5c3c9945d9a0cb8593b17aad3a04dcab33", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ 
trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 
1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "4ba1cd4f665f51c7d916a85815e0c70adf4b3a40fd9491758fcfeed6c41330fd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) 
+, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize 
*/ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 116224, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "58454e6a9381c680e0b7608d28c362d8f63bc318fd0908c3600c1d0e62f1a200", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* 
mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* 
mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "6711a7cb78b94798d476e2015527f78a272e12c28601584a656989b57ec5462f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* 
mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* 
mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 140800, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "182231b859090838eae7617076cbddadd72b1c7dfcf6d05b8e693645d37be952", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* 
mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* 
mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "37967b27f7d3079b25775e07ad93ff48ee743e530e1e1f30daade4bc79702c29", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ 
gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl 
*/ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 140800, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "fd894052ddd3be8836eb3f9f5bf0e4501b594b169499681925c42521e7836faf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* 
mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "25cd93ef37a89766143088b1f3b6bbf02cb5a58b469fd60858a3442c2dc0746b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* 
mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 157120, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "2bdaf648e20c21147961c0de26d36cad1b004459ba96d2277229d644b8d7b5a7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, 
/* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "4ff3724a4e7cd7b4050a847a825c0229b35fbb2af6efe37ae89aad1deb46278c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* 
mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 157120, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "d606a071c11855c2c44f2ed00e67cbe0f31e98c8e49fcd4d66ec9475808c159b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 
0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "c8cc6d888b1ec6fe1c442d9381c13548c5d20305f7eef33d890d7e8d3de0cc04", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* 
mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 103936, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "23c7c050fbfb6c9add2266b6934269ab38c98aafb0f19d9a0d162e17032ee23a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* 
mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "f8b279c3be2bb4bfc46e7882c5d8f4e5fb9617d43f0c9e18423e57b2ca27e827", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, 
/* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 103936, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "8d2c97077db4c7a6baa0a2f64cb3d2e8c8a92f13406bc48a3f7e25b89f947153", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 
+, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 
103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "4e8caf0cb6dca05c84e297e98d8cf86e1ccc66a9ba098a9361b7de25bf0d96e1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors 
*/ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 123264, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 
512, "f0137d289298ec525dda077ebcdbee34c342c148ecdbcc97694ed741b8bc9850", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ 
trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "0272db6937e0556c1bf94ec38d5f00c3a018527debda2f251335faaafdd006e3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ 
gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* 
mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 123264, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "e08f77deaa76a0580f6e7960c31dcfb9d29267a13c615b5d4dfc1383d5e02fe1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ 
gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* 
mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "e82554dc7bef6cb0fbc7c84b0cdb678e7e92f13a17b6de8b326322f69351b1f2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC 
*/ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 
1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 71040, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "fc979100a837ef569d02e1c6cb1ee777b379ea82e19d393cb9673d79182933b4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ 
trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma 
*/ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 192072, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "282f18924a3743702e6cbab5468e3edfdf8c08524781b8d66c32100a43a7425f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 2 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 
+, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 256 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 168 +, /* mNumRegsPerThreadNonEpilogueWarp */ 96 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* 
mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 114176, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "1fe2a4fcc63419741b974a98c075cb6fa8ac47bcfd0c4697273257d4740e011d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* 
mEpilogueTileN */ 16 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* 
mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "6890b924207a3c763c0fc2f4233ae126bd10b519b03ee6562b677ae8b3290223", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* 
mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* 
mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 114176, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "49ab452b4736dde55c25e388b8a3fabff1eeebb0efd88e317c569f4288791d34", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* 
mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* 
mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "15baf303961640c00748cc04411d9dd5260c2eaf9103a1a2fe9df4625f24a491", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ 
gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ 
batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 136704, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "39ed1add34b6ae815b9e5144d652473503f4705e5e4e691f11de0a6168be09c1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ 
trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, 
gemm::SmVersion::Sm100a}, +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "b173038fe82fcd45c74e0f543ffddc9b5a992e508427693c88dd603591346490", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 
4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 136704, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "d9114595ca5160f3364256fce3de22eaec00ff8c8c91c132c81760d431845626", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 
4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "7ca8f71a71517ece6d869214045ce026d088f02b78000c4023aaf66b220e7072", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* 
mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 148928, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "95a47449875996cfb3a083a3fe679ce1383a36802bb098fc630a755dd8c1e271", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, 
/* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "84f35f988e4ce8b3648fa0f136317bf1e6b7e7b61df943096b8b72228792a122", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps 
*/ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 148928, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "c7dba6c4daf51b0cc7ac03c3af2230944104706be390378b61ff5e96ed8ab93b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 
4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "773cf8a7290cd697a380238ea6fe8b2cbf8495bf0e4836c0dade709d4b1be5d0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* 
mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 102912, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "5e2396359abf0a28a42978c6b726f71c03c61bcbb3b6833555e76eec44e71a1d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* 
mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "8a9c4dac0b5dcb6862bd595b021c8090cb6843c315b578360336e703484d7135", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* 
mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 102912, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "bb6c1726223d7fcd282381ac83193b737a6ec3d8b478861c89d343ce6d17d7b8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* 
mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "6cecc3fbf44f1b439daa5612e8dd8ddba93647443824554fecb21901e2a28460", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 
+, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 122240, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "8b5951d2fdb9cc65e959d057a36b3541929a13e794357780e480513b06277b9e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* 
mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "2acc6b18c473b2f36881d5da27d6c2e4a47c16c3176678e99eb22f4605cda0f4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* 
mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 122240, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "19d26ab5bdc47933cfb624c5a1a4af1ca14a24f5d6a9a535a2d012bb9b017116", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* 
mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "37aa83626f8ce4bff0b844d908c585f0d5834775c4f0e8cf7b85f9b4a8c33c98", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 
0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100a}, #endif // EXCLUDE_SM_100 + +#ifndef EXCLUDE_SM_103 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 116224, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "a0f5b6b5851c8e8f8b984fb06f752251f705bafbcc1b67bfafcc6415c1d39aef", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, 
/* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "8e4a92864264e883022cac435b7c3f54868a8f227d295fa1e0543110ce407a0d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* 
mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 116224, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "2c53fdfd3bdf91cf8913e2ad48af3b32cb65283997b3bbebb9c4d2113de2e8c9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 
0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "dc5f40da99b04a75c8647fad5390bc4adad1dbfccb59ef1bbc533950f951f3f4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* 
mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 140800, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "d1b5c844e6c38acd2bad7c412c2f0e1319f9b76c526c4af902d2d339a3181c37", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, 
/* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "86abb38d82287b384bc30b68834085cd1b76b2f428b612288608d5decaccd406", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* 
mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 140800, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "55425892a71ed9830bbf75113ae99e5afa4d34afecfa76a809794440cb89fd42", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 
0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "cc88fd486b2ac28a4a989b0840a212ecfe8b4ed2cdfa65f5e5ec0ed0404baaf6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* 
mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 157120, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "c1720c19a7714acfc0f9809ddccfb301dff4a8b0f19eab3666bf7e877e696899", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, 
/* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "2133f1ca2f1e5fd5919bd5be40c35715b7e449b0086f863d54517e279508b7f8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* 
mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 157120, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "a4d11a241c37cdfccd9b98763f1e617b93455fa9af2381f6652863fb339a8e5a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 
0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "0668fcaa93a3b9db892289b20c8e805319e64c43ff7331593a084ece4da0b334", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* 
mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 103936, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "144dae2c3829b8126fe4fd649471ee4bba104ea4b7ae8bc3dd2dd0e00adc7bf5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* 
mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "65252b2b596f6f377ad3b3a63cfb14034bef1d08e11928d9f89337a0ea123f8c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, 
/* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 103936, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "ece177649d7f32ffb8f04561eecfe9fc2a7801c27f87565b7625833b0f56f001", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 
+, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 
103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "accdd4b75830ecd977d8ca73de8467d78fc3df811f3a6ddf36ec1be366a6e6cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors 
*/ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 123264, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 
512, "8b3d24b392aed12fde95fc281a5e2604bdac528bc458806d61480c0b7a1e0a7d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ 
trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "c347838d3d85bdef1b279711f50321f580e82fcaba9e864d88caedeae8db043c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ 
gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* 
mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 123264, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "488565c97efcbd5d511d629c2d7d5589e74e844eb81edc99c78a4525415120da", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ 
gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* 
mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "56d309e264a5b17cf6462acbf59c601edbc56635f20f9c81b131faa6e744bc97", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC 
*/ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 
1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 0 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 114176, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "3a61b39374109515e1ade49b4907be59e20bca49c5f480cd6b58bfad54cf1a17", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ 
trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, 
/* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "58f8055300a8040f1b135c1ad6545e11a612ea599dae9d778cb15e2d956b2162", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, 
/* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* 
mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 114176, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "f78cab2e7d21ea48e850bc3bc10bc94c33b12e03858cca10ef7bc86cb5213ce6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* 
mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ 
batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "c771ffb7c2b3bf77e197e0f5839776491b793c86fceb2b212915efe3d0bf6d0d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 
0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 
+, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 136704, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "b86092e42658328697f756e0b6299a3428fe9e909efa0d312e8917dd234fe7c3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* 
mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* 
mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "ac078c0e70694abd7747eca1966a3b540bca6653638d751d277957ae8bdf16a1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits 
*/ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, 
/* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 136704, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "9f0cdf7956ae297775db00157b8b12c2377a4ab03ad2b1413899367c45bf9b72", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 
32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 
1 + }, gemm::SmVersion::Sm103a}, +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "f0e809798f7d1e49e03251297864d15cb475b731ea1645f32473c4385f17d5e7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* 
mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 148928, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "8798142071deb2b08e51bb9c48785d96535a4b8777a2dd45a27502d312eeb8d4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, 
/* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "04cbae1f9840a6b2cc8370eee77ae07caed03f5e915cdfac3eba695656d9925e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps 
*/ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 148928, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "cdf8266c614cef4089fe7ecaa599b7c3f94a539945b0bf7c59d1ee59c5d164b7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 
4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "6dfdd8f20fdd53e462b29131837bb63657e24054151b244c5fdfaedf2de2e339", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* 
mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 102912, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "ad09703b01b1569440d32d8ff305f54c8d4a768ad4762457ef36519bc1e6c51c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* 
mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "a999a6d584e92e81a65bc4bb11c7c3f6c029b5f006be0325b4bba6303136b512", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* 
mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 102912, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "8c0a812f3e5f883ceb969c21f2f89b20133e0a91774fa108287e9ed1939edc9b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* 
mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "2e1e4f8bfd957321b3be798086873a82f12c7691350dcfd77f125b360a6a369d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 
+, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 122240, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "61a677a4b6bc423eed02992bcc263571e3fc75a83ef5b875fee35945ecf15a48", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* 
mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "a3d1435396a8edaad263c7c90e09004fae95765f47cbae1e06b21184fb0e7a7f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* 
mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 122240, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "20fbcb015ec4f5b5d19387ce0a22929f4c6587f164af9e6bb575ab7591f40aa3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* 
mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "c4eca7a82523b0a75b1abfc101f9cb6841de0707c3c010f99b6e8b67a78f6e96", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 
0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm103a}, +#endif // EXCLUDE_SM_103 }; // clang-format on } // namespace kernels diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h index 71152bc995..3031d47bc8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h @@ -565,13 +565,13 @@ public: public: // The MMA kind. - tg::MmaKind mMmaKind; + tg::MmaKind mMmaKind{}; // Whether fuse Utccp into the MMA task. - bool mFuseUtccpWithUtcmma; + bool mFuseUtccpWithUtcmma{}; // Whether use the max TMEM overlap trick. - bool mUseMaxTmemOverlap; + bool mUseMaxTmemOverlap{}; // The number of epilogue warps. - int32_t mNumEpilogueWarps; + int32_t mNumEpilogueWarps{}; // Helper for SMEM allocation. MemAllocatorHelper mSmemAllocatorHelper; // Helper for TMEM allocation. diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/CMakeLists.txt b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/CMakeLists.txt index 7e1ac7d13a..3f166f5cb6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/CMakeLists.txt +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/CMakeLists.txt @@ -18,8 +18,12 @@ file(GLOB_RECURSE SRC_CPP *.cpp) file(GLOB_RECURSE SRC_CU *.cu) -filter_cuda_archs("100" SRC_CPP) +filter_source_cuda_architectures( + SOURCE_LIST SRC_CPP + ARCHS 100 103 100f + TARGET trtllm_gen_fmha_interface) add_library(trtllm_gen_fmha OBJECT ${SRC_CPP} ${SRC_CU}) set_property(TARGET trtllm_gen_fmha PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET trtllm_gen_fmha PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(trtllm_gen_fmha PUBLIC trtllm_gen_fmha_interface) diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b3dc19bf97..b232dac25e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5dbeef553388e130f1b41304228f629807dabac3c5e0c66a0c28ea9833340040 -size 630932 +oid sha256:616bdb23263627aca4ce3448e32e9b47b59439aac8f774e6957415d1be92a6e5 +size 620620 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index cc6edba634..1bfd559772 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a90e201852a6f3c911e982a9f819a6be765814c39a24b0c9f53ce4dcb4d28c33 -size 565951 +oid sha256:f5b3750593eb35e8d451dc5842cbfb9cdd1e63345f7fda202eeff31e160d3839 +size 558895 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 13b2b7f6a8..822da83bb8 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7b3100a44dbe296bc51f7ff18efd9dd1b03649bd993a1cc54da0585208395ea0 -size 483501 +oid sha256:46463e24ad7663a32582c35cf61dc56da6b37feebefcbd5df51f2cf56b6a1a5f +size 476471 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e968a81b6f..ddbf18d771 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ee2df4ea7203351cd5d12be09234ea39cb0a0a3d1f6a7d8f87790f5f02991cf -size 452197 +oid sha256:05f0e9c20a152b830151736c8fe075b786a2885497644ffee53c032704fb5975 +size 445981 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6c8cebee02..7fa209f797 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70bef448c376a3e57294f21afc296ea1e7a1f21bca158cecb995de47fcf7a9e8 -size 471975 +oid sha256:4cb6bc8d55b63523388260b43cbb7b5f32d0925103f1e04d6896dd6215129578 +size 464181 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1904b16b53..4a997510dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:abc46c06eba3f60de4c29a4d06b672c0fd406f0b94e82c4828919c3ad173e094 -size 447801 +oid sha256:a96c271e3b91ff63d68a9fc187e1ff980bc80822871dd5ab803dcec488e4c18d +size 441585 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 290496288f..5eb5eb563b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:707f1643ecd4243edbc2552b1c14e13154f659f9653122386c7d71fd97762626 -size 626982 +oid sha256:0b5202c7e41c5478e832ebfd56ebf374574ee0b643716dafe446c4943204307b +size 619016 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 00769445b0..fc6c585386 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:949736949a7d8db4eaff5171d6a280fc75e7597e78d5cf53cd698bf58143599c -size 566023 +oid sha256:80486e68e84ed71716050db50073accd0eafdad79905f857f1ba01f82836d151 +size 558203 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 77c78a342c..7924b8a54d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:620338e0232c4bc83022026e927ad4337b3438bfe5225a771d71ae821708a67d -size 452573 +oid sha256:b3df4cae191465930320aaaaaabd302040f41d1a17609902dd866656dbfb8aea +size 454251 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e1105a4183..fec3f307db 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:501bf6e66db0c3a774fc7505841ccf5b02da306e43522b0a694d61953cd56864 -size 394599 +oid sha256:04a36407b42e0d90fc30a20a21fe5fbfdc70c2fe64d667af6dabe5939f6d071e +size 387593 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 4fddf6fa16..d8b174fc18 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:f0f40f5a666044cca61e289b9c48c98977691b9e1d32a72179128ccbee15cc55 -size 436291 +oid sha256:39332cc0ebc5c940feb6821101fcc757100d5b89281c0ae3f588ad5a662dd9d3 +size 437179 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 131a7372e1..9f35504414 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c59673a8be4362f46f2e9257ec7c4e38a4800c45b1945f9df0cb32ed0c6d43ae -size 378317 +oid sha256:164593e1d5b88e251c405d70361ac4e74508e6ce65208ab3dd6bf7976350a103 +size 371311 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0179359809..34cb4ec580 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:34c1fd4fa79d1384876b968ffbf432fefdf981338a7956e296799295d1d0247c -size 505637 +oid sha256:2b8887fed8b56ea827931b12e2452ac17901c674fa9aea0b19ff4fcf0dd951f4 +size 498607 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 67d3cc1421..0f54ff856f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:055d697387425f909c5329b792c5db97a58cc847cc7755169de50f99d0ed5f73 -size 468807 +oid sha256:cd9058dd7b9dc2d549b698a84f95b868b51907d0ff517de6b1b25802f148a683 +size 462591 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4911a5924c..04f1916ff0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:60c0b37d90c1fc5e3cb27e480d84d3b18d47cf7b1cd4e3a3af35f941100e54aa -size 494111 +oid sha256:d36b4ac976263bf8ebce2b478021cea027568e7f93e9275e6124c5cebdede87f +size 485527 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e1f14eded7..4687fc5fa7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a3b9f3aee4f1a1d79c4876a4fec9edb0baa4c78efec3c76c04ca8bd00af76963 -size 469935 +oid sha256:121ad8d9cc1eb1b4571cdb6da67c95a8bf94277fa3bc1f959a5b2f2fac9f7cdb +size 462931 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 15cc8270a7..5491f0a396 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:2f417704dbb04df29ad9019c106a1de5aa4f4415e7b7aa206600843b30645000 -size 670528 +oid sha256:15829041f11b292854fb76fed462349c720a0ef30a88c24bb6d1c33754fc8490 +size 663598 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index f2f547d237..3f6c0b55bc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b73aa21041034ab532412a51cf00dda4d8ffbb5d3970628dc5fc9fc0dbd39e24 -size 607571 +oid sha256:8e15720366d0d135f951818421edb4cef722909f693a80a8a5208446d9907010 +size 604167 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 158609a3e9..aaa609231d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:dfac7eaa6285ea9b8e02a1ac7318a14a89bd04dfd9794d194c55e45583aaeb1d -size 481715 +oid sha256:75f42cbb4afe3e9d57246039bd2cd711bf65fd95dbff64b38242f493d9b7d871 +size 473919 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6ae979c279..2439c93f65 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b18b4478ee849f9f92ff59f68887710b0c9af43d57819821e64ab81b7c8fb1cf -size 411875 +oid sha256:3eac4d690bc59e2024c92af584bb6e0f29455e3e4575efc37cbc96e3f3263a0e +size 405659 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index ddb87d88e1..09fe459cd8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:b39f08d8599260d351b6df4e142512bb75dbaa219a4a701a26863c0566d3e72a -size 460697 +oid sha256:a3c937220ee455d17e5703bb35e82a00b27ccb12bd8861e15be014823d5f4619 +size 454481 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 414dbd42b4..c0d0fb0b72 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59497957d8e27c087941c6bc5db6dbeabbd0e02f98b3d359d0ba347698927f8f -size 394015 +oid sha256:8ef74aa9dabff7601631ccbcaeb5880c12bf391a1f30cd830f8bf272a218a484 +size 388587 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b77a11808b..f157fe85b8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:55485e3210691cae102219ccc7c6dc4428a79a3115887447e8d584ac25103cf5 -size 663022 +oid 
sha256:9f017070eaffd5f5289a69cac49ace13da7f2cc0da94f6576b6fe38f76380bf4 +size 649580 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 86d2475f54..d5134893ba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7fd46e418a1d9ca3563211e581bf352207e88bef95293f71a15a63f516e008e4 -size 574435 +oid sha256:b102a4b49b61c21a3b1639159476fc01b977d10464bcdd50bcfed77e5ea0ec9a +size 565703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index eb169b5463..e463d77299 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e9a80632812c1d98e05131359d8d0282a0219766f4f949257804e0da566c479 -size 593219 +oid sha256:3722c508670751474ea923f7be4bb7d4bcb4ffcdf72168b4c8abe5f9921e29b3 +size 586977 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index cea57988a6..b646d1d4b2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:587b63d2d1e7fb17f0ff8ab7483e4fcba28e4944bcee169dd5d5ca2909b567c3 -size 569019 +oid sha256:dd3e414313b8aa48ca65260a855ce7e58266eb6603adf67aca52efdce74da394 +size 562803 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f8f218e683..8d837fa165 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7584eb0eb94587d360419bf58a4df610a61b99069b044f82c9bfa8669cc1ad2f -size 580115 +oid sha256:ec71cbae8f5d86c3e8149b4aa095023082dc2cbcf465e41254c40db711b20566 +size 572319 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ec0489c79c..d1eece88cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a578404ca82c1fdf9be383d49d0f1e7664e935088fb33d2bc1bdb4baba166d41 -size 559885 +oid sha256:8345d6335559c4bf863948ac445bb1d6696aa38ae23cdbbc392083c329324064 +size 552091 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 75a42bf3d6..08b6fc251a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01801e6a36b2ad80cc9bf4ff90586b853694271cef751f55d2c5ba5eeea401d8 -size 663760 +oid sha256:f25796409ecb2803e61f092484b88d7dc6b58539b77c48b59c69dbc11abdcc52 +size 649578 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index b896cc98f6..30d6514190 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d444b0c26d99f3dc92f58a172064f3f7c21252164e9cb127f6b5ea3cfd8f9af1 -size 608499 +oid sha256:c20f6bfb52e6f222bcbbb202a4b0fb0aab4b95e73634853119141b4ee4f63a1a +size 596609 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 74481fc1e6..feae20218a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2de5904a5271e374185323f2029b11fe07edbdf22666a562cec4e9868915e57 -size 558345 +oid sha256:be293e8aee13242adb29ac239f2ff332cca328949a69b0b3815e1c8116233d9f +size 559233 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 33b71349b2..a22ef8354f 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c4bc2d2f470b216d493ecd2a9d559fef26d27d34a5c9d5a44e5b890beedbc6eb -size 501159 +oid sha256:b585477230f60b14873d5302711770bcbeff7410398792978f69caf37ef49b56 +size 492575 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index bd6262eb1f..511f20a461 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6bdec3801521171bf2fc0442ccda65a5d66e36be28ec620763974a7fd8d51c1a -size 539695 +oid sha256:35f0da62936aecf7dbd160137f5dfc408284082c5e1751e21467ade0ef502969 +size 541371 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9ba2e2b408..c2ec60907b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e29787b3c4dbe5adb56c163d3e8b89191afd58615add63acdedc8a760ca3814c -size 483299 +oid sha256:32a0d306e94f90acd490969b36f28df7f3b00a91a25e1b97d28296d2505ec54c +size 475503 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 194042c056..809e76709c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f0584f5e2953b9827bb31c1533f9e8a36e2998bf952f30c87630c2db48362b6a -size 616933 +oid sha256:ea93243da8bf5dff7d14328f5c13def92bd011cc25230fd7dc8b0e5e71e04d77 +size 610691 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8fd9ffd63c..393e686bb5 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce98ce92ba5132756ce4b5bd3ec350e47b80ef06e573baffdfae212ba0236d57 -size 591153 +oid sha256:ae33d408c456f0db40410e49c5575354ca2ddee7aa66fb64ed77083d5a5055f0 +size 585727 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 907b7e3dca..88d2a19b18 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be20a25127519e3e4816c0722c1868f9afd8556118d0ede9668a7313b781c0be -size 603039 +oid sha256:4586d5bb50e101f31f248608789bddcb0bbcb80abd828db5c6d531b3d527dd7d +size 596033 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 
25249efd70..06dda9aa31 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a00e5f26406e9cfcf828c84c9cb054f096bf96d238989ddffef1da097243bb49 -size 582021 +oid sha256:64eebcb394addd848d705a111224979e41e202bad6e10a968ff246294c70ea27 +size 575015 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 5d2b6b37a0..d5fe016d0c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf56cc695ada25ad4b2707dadf77d0c98c3087e2f52ee6c89b2a9d59d438cc8b -size 702670 +oid sha256:7235264cc8742a630cb0ef55dce3b20550fcbeea1292af972eeaac882afb3234 +size 688486 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 6f579f6557..e1fe98c6c7 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:350653f2f4134348c19bdc0460dcff2f693023b783023a4ed6a1eacae1715dbc -size 617191 +oid sha256:3b78df7d072083125979fb3e517a2bb564e6aec52e256091c127fcdb36f63a25 +size 610135 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index c96a2b12dd..f03f2f8994 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:90dd023f70455b0788425890b8cd7b869b21f732e2073d6085a60fea86f8fe10 -size 586695 +oid sha256:9eb18afe39f7022496e7666125b68a6a83d1a300bbe88fba2c62584bbca5f952 +size 579691 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a8a3930e9a..b36ef01d18 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b77f111c0a72a1c478c4850102b729dba22c7d28ce72b950af96e6ca838e36b9 -size 512909 +oid sha256:123b2693b236e75a98978328bc5e84c4d4155e82c2de1fd693a61bb284b5197c +size 507483 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 92f947eb1c..111f4b7435 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cbc91114ea6283743c584a2c908a431d0339d1f8f3c1b62d484e2147a6c91688 -size 564889 +oid sha256:20d78ba25b66081b8ca9bc81ac72d3db8700c35c71c5cbb7981cf03ffd723872 +size 559461 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 764ae7da7c..0d96b0673a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61472723f538d24c9acd2715d33bdec5b0b398c8721f760b2d1487a18f9c56db -size 494259 +oid sha256:f43e322151d205cb7be44aa960e9c058b1130f746a0554189323dab1c088366e +size 488833 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 9275c67885..0b96d5ce55 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d573460b47edb6494ffaee16a7ad168fc838bc1f682276dcf883f3bf87dacd8 -size 561491 +oid sha256:bd3c554d3e609746f5c88052f610407e0ba7a77d218b98f23d1700db288d3672 +size 549849 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 1cccd823b5..c731a70fa9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d04d9dc89e714b2fffd0e2c3584a5acfae5d8f09b372d73331bc6bbf8ff5e31e -size 502949 +oid sha256:303745a8ad276cbf903057d90dc83c5d5d187b9a7fc8820896460789c04a4d55 +size 496463 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d716d06c68..bbd8dc0322 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:357f912b4b3932fbd1715ff5d3630cf0f57ad1ccc8345319f66f59dfe64a244a -size 456883 +oid sha256:322e38aed9acfc5ba5083010671a292a15866cfc7ada0a1ef04ca08d11402e0f +size 450619 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d512b1c652..3d21446a4c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e50308861808e757f76cb664d4834089006cb5897ca20ca7c62047681b38d58d -size 442723 +oid sha256:ef75d9f8b4c1e439803fa420345fdd27884817117e761f3d655eea2f1d300487 +size 436507 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2dca924eea..e039d9fc1f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:07088afdd4892f3359c43cd71a036eacf99aaa7a61ff5be89cd1a14380d69f57 -size 450859 +oid sha256:f7ba48776a57bee1eaba279f88b455d71830353dd0f1c17982d38534ccc7bb79 +size 444643 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2eb05a48f2..f690ebbf34 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cd3ae620c3742f516c4cd35b72773bc305e98b98b1cc4e63d44d5083d30f123f -size 436747 +oid sha256:19cac5380575aff8a39767424d38b6b463b006bbf92927f24c96da6593cdffab +size 430531 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index e9b037bea7..c14636e033 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:10dfcdeb3f26e142320f4deaa7d3100d3b411fc45696bb21232a102b0694efc1 -size 558333 +oid sha256:254581db67314ea198da7aee258596f04f6def6d01a41fd1ef71705b04087e20 +size 549083 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index e86e2bafad..9a4a2efc91 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:bec6c7e277fa9f46c6e6650bab9c17611dc4cd1669b4d75211d233db672576b9 -size 502233 +oid sha256:18572512305e1229566e0a6eb81f88f22fa8dae442f08cfa419334b00b89e980 +size 495769 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index b07bf09928..b5e3b5e5df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b6ed54b526c428b08dbe9f7e80e7673fb8a2d985ce065d9ace6730c44dbcd28 -size 441719 +oid sha256:16bd9568835544733fdff2c27b988f109b15c638ab04cadf8e672fa9831fba33 +size 442607 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2dcb8c2b4f..a6f2fccbb3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fee499381c761edd8ed3c707e9e86af26b7d8112b7285d8562364445b4db15e3 -size 384533 +oid sha256:7ec6566eadcb6b1c559e140b150aa15380b69aa3a5ad6363a6f2f55e8e6a66bd +size 378317 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 7a724f3e6b..894d2ce823 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:527451a03c38f8a12a2fd09c911c4ea54cc89bd7f688108ea30694f661bbb5f1 -size 424449 +oid sha256:a97ad0f83edda84db155ab182b6805ab79f1b7f1f3b9a33717aa14ffcdba02f6 +size 425337 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 810656c2d7..b6cd4221a9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e3cb444f826882d1f9fdde1d49bae9235f36fa683a9ab2b7879f82a4207d689 -size 367265 +oid sha256:d8b3b3bd9790a04368061c248476f1921afaab33635117336f4fe3f8f7621e9c +size 361049 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 376b5d33e6..3fbe2b9479 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32f4a45f95b8ae84a0e3ae5a43b3b5f5c7362b1ce223819f516f4b4e73170bb7 -size 473445 +oid sha256:471d613cc4f330de90ee3adaeb3ca60311cd5f5566f6c186347954cbc27f9536 +size 468017 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f5fe8694b8..cadb6fb6c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d8a3cb32086a1867cce707e2300b7dae6b3f47db799a77848534a205e6846c1 -size 458543 +oid sha256:3667df7be928d46644c990d5edf800b4267ca16d2b1af246c2ff1fcd185aa63f +size 453117 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0c472509cd..422519f452 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa25c1b87150f6db0800cebb0d799ec948d40e80384c5312eedb52f73a1d6618 -size 473783 +oid sha256:198938eb3da07564a649fc28ae9a1f347c4aa3b3b4d94c7df599e164c7c78153 +size 467567 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d9dc3e80b3..21728038df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89c8a875579b5d4ff2f53c6925a71f5f382ec4e0763dd56ec92423aab9166a94 -size 458883 +oid sha256:dba544890e95b2fad732d7de898a7bfe6bae8c9e91d0655fe7f092314a6f8956 +size 452667 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 747d330547..db61d40050 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a654c16808084190287ebf34535e68dc1f4d3d1f3c082912633e372f35df0648 -size 601089 +oid sha256:817cf4afb5bd708007464a22a2ae0ce270807ba2b7e5b259d1613f9380d4f9e5 +size 594947 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index e89cf9de1f..b8c140ad6c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1fdf46c99fb0c98db2c447a5e22075c9f3e037665a9945e61763adf2b3f5824f -size 545605 +oid sha256:ce202d4c5578e1aad38916fb24e627f6f217c727bb3cbb201d268a59d6f1eb17 +size 538231 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 291316d67a..b79fd75a00 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:04b2dd3f1d6ce30d084c944caf3301c274424a004097efdff1a345f72083dff8 -size 470069 +oid sha256:8c14dce9f0ef34ba1bfa4dc4adc7d99b304910ca5fb785bd41b0612f9771c784 +size 462275 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d722dac273..6cedb55948 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6fb4cc06f718df560c8de15140cabef4e98bd1d0f35e186cf37d5e94a8256863 -size 400231 +oid sha256:28c0a61e45a428546baada7385da98a5beba03372ec0bfd4282b8edd305ba7de +size 394803 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 2c907f9434..5741e94d8c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b1d5f5ed80ed7d8d0c0dcf465f31ada29aeedaa85e4da693a684a5cf940d927 -size 449643 +oid sha256:dec9c27d8f9b4bd5db6f88275274c94adbd30ea2ce8220afe8fbd8b4f0dba5b6 +size 443427 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3afcbf5251..b8559f8aab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06c57c87ed62e80008fb18fe5688f1f633c45aa0cc1a66f826c1efac62af8bdf -size 382961 +oid sha256:d1c92e1c9d473d21e3a9bf07502e01eef0903b9183ee409ff40bfa089efa5ee1 +size 377535 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 8f55d3dd36..53e3c740e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e2568162532c48e842319952122b3835e4ef014861cff4f4e37491d3703ef58a -size 749338 +oid sha256:286fce5fedc98c25e31b2aa6f9e527364b1f634cebbcf4174a9e5c6e86d73226 +size 725312 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 99a07e2bcb..c4305c6ae0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:07064296444d7ac6bfff94dcc790acd605fef68a760cbc5ead3296c880789567 -size 661516 +oid sha256:1e8d8a2baf9baa1eae8674c5f55a26346e6e004d90b0d035179de74362b79275 +size 636060 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 551e3e18cd..ef47b18a35 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:75be8ecca1ec7acabb23fe44d3cd08fa50d46a2600a921c8e2197ff731141607 -size 776790 +oid sha256:aaca490b5dd9c7990f603c992bb750e617d927caceb019397eb8eb4fa44194de +size 749312 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index c4ece325d1..d76bc77079 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b354769aa9f5659b436534f7a3cc4aa09a493858d43ab70bdc316f6b0d88d512 -size 688476 +oid sha256:645a80c48a4be552dc052a5ae103e69cf8a3b504a97d2a5b235005da1ad5261c +size 663414 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 7841edce69..2f3dde3bfc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:883f192f66396084fb325abbb832687f2f27887c0018a970e39bcae575b4aa95 -size 753232 +oid sha256:7922a772bf56cc4a42e3f8fa5ea6ea26a20092baa25e85254820ab7ebb872f0b +size 722004 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 4c159437aa..0e2a48c29b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b35e56a59c42ea0b6f30bb49076c0f93611fcb00cdd76dd4353b465adc0c75c9 -size 661810 +oid sha256:511da0ba6a3dda888cc95df62793f89fc236b630df0bd8a4e41921058a3ee1ec +size 636650 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 
f176b5baf9..be2f3bc77e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dec9396cbb1bdd3ca6dfd831b9c957ad2fa31e0ddf2a6455b91b6aeab6c7d06b -size 773336 +oid sha256:72c6ccb1e756e7429685f0e898e60b3b680b0893638054331b88322926109f36 +size 745610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 18845540b1..fe187cb1e6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f445f0ff16b08249c7700137d350ee6b1964ddb183bf4d5646852504a38dc61 -size 688524 +oid sha256:f59be6bd1031842c71104dd7b520c66b661b24b8763dea1d663e0cf58fcaa9c7 +size 664696 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 8e4d77280c..bfd383abb9 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98265cb4cc56fe7336995f39759b8a4dbd6d35b68f9c934d746ec9304c16e488 -size 818090 +oid sha256:e91b7bac7cf917b6003e608ef39fc92f5b7ae1ee6c4e16e56cae1b572f5f4781 +size 790118 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 9d604e8e96..ec4edcabef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80eaf7e695a9144f3e9ae58526dba4d62e19c374e3e3f5191d4216e2276ce68b -size 731700 +oid sha256:4c52649dc2f57b803013a52f84db1d5789258817b6c34ca17917e1c38b94a64e +size 706540 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index b4c838d518..ff1b7cd5d3 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7919d7baf0bd29baa550b4c9331cb5b74c9184211648a2ae4309956fe4b707e1 -size 844952 +oid sha256:652213aa98b3b0141802acf20310dcf7a8f24a05e94062b4e8f82b3b1f272b38 +size 817078 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index f690f28ceb..3adb5145f4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9de7275c32c73482f2ae5e5336051bdbc172dfe02cd35950535363775a97636a -size 760042 +oid sha256:4d9521c6adfcd41ee2730cd1e0ed4502661955c2c2b0a5e6355d150b16492b0b +size 735128 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 31a6a06659..be473f6ed5 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5be32375b4f0595bfd691d4b01e5851737bda01669dcbd1a101db1e7fecb7f8c -size 874036 +oid sha256:2c99f44a0f0c435436496cbe1d498f185604b701ea8fb67ad1a0e2c8ee9cc444 +size 864762 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 307284ac13..10a08f4de6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:83558fa89e1d26bbf51093c42c3cc3af31711cb555fb496620a95deed45345a5 -size 706006 +oid sha256:09126ea87e05b001bfdcb4b410ca492a00f7f9b29cd63b021153eee0e582f8df +size 696188 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 323b8e23b0..531ca2bec7 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e9ca086d9b2d1a1996b6b948efaeb61c609f8c052b901a1dee4e3992220784f -size 775714 +oid sha256:bffb58dea79c2d285f3c70d5950a6e6af4213674df55ab97aa3bf597bbddbf6a +size 763528 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 00b5b927ad..d2592146cf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9849935550b57458ebbc1d8cc12f4db1b05c33af085aedd5df63b3c845ad342c -size 773198 +oid sha256:a7d0c270d8b10533914550d7b0f251dd0ad666910bdaad35f2aa4cef257df932 +size 762986 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index fe87012bfb..2da88fac5a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:42e8365c84d53022a6ef4ecfc0ac76a079fb3b4637ee7ebdb43d2f16da0f12e0 -size 660370 +oid sha256:f9e73caf1128411b4154623d89c0b18eeb08eefa1c83b0c62df124e9e035f885 +size 650306 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..66797badd8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:586cb3a876e522f5968a14b01f412fdcc5dac8d4d14d1b00721b590ac02343c0 +size 919842 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a22e0bea0b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:5c38ae728025373804ca8bd535fea64a34bef25b3937d0b1c09beb5a42d7d425 +size 599913 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..99ddc5307f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5833c71e1fe3984ac1ec0f9dc51c18131db4bb408760c8144e406f8e9118a053 +size 657042 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0c2e2d5ac4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80b217cc222c46a69212e3408b94d7dd90e97e14cb7ddad07d520b295fc11989 +size 786344 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9dab76b246 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df279dde0c4be2f933ca078a633ad278365853f4ce3394b132b67687f34d0482 +size 558865 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 00371ebcc5..a5bee933d6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f2987e51b51870e7bcfdb3e631f4cbd0f5fbe47f9a8fb8f219fb6f716d11c27a -size 834318 +oid sha256:707c90f34cff5ff1eb2efa9fb19f1167d12aab1120bd2c38025ff8682dff53ed +size 808220 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ab4ed62e0b..cfaf422288 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a05993e27df6d84f3782bc18b4d82ca8b4560c3c6447fa0d18a549f7206b42c5 -size 694160 +oid sha256:a0930b434c331a06503648130567fe9deeb7ceb88639011c363c923990c9f925 +size 681628 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 3e605d9d70..4cadd8327b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d87ad519f98c10ec7bef4c9c77e6d7dd8239778005f7808302903d900ce00e7 -size 753162 +oid sha256:932e25908c12192a9df5f99196207a383e0fa946ad564109e224a9403b9e0671 +size 739052 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index e5be43f9ca..15d354da7d 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8180e747e241a2a539db4613fa33edf0beea927ed8c8fb72a4afcdb1628ac841 -size 714978 +oid sha256:acf9ffbf5cb55d97f4f76bfdc1c3ed88f062f44e73df5d43b8610b60545c3465 +size 702792 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index abf2b82c00..08d7542a36 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:91f1f04bb85721ed4b00dc30e63ad0facbaa2d72514237fa2726adc9f5f78701 -size 653654 +oid sha256:93aeb7f8f547afc63e42bfd46bd09c44bec825c9d066e7056a2ab64c58738449 +size 642850 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d2e87ef8e9 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26aca55ecae1fcdef4dbe3f0803ddc4b01898946dee5c87b622a2d59339e0f01 +size 864090 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..144455e000 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ccd5fb5faa025c6c0ebde753472d0268ba1ac21457f449e5c14bb8e5c2afaa3 +size 582195 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..11ca96479d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9da9e8291d86f48716a6be6609a8f3643bf6cedfb0c28c6286d026c4cf847bb +size 632516 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b31964559e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69a6458ae43ce6cd72d966b386c95a5c5b065e36c2c33180536e0a0fedddc346 +size 726200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a1fde65e59 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dba7fcac1eb03fc9ed1be7c12b4b7a62abb32f0d7a60f51cfb9266ba26ce6ea +size 550965 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b5f25509af..8d06af39fb 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a44bc023a76941f7445520e260d0c0fc8760063e0d8955b19a83574bae6075e6 -size 848746 +oid sha256:5c944417f282260a96c06bd22adf9110b10c32bbcfa100c38cdaab8941cabf7e +size 819984 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index dd844a4057..4752be9792 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fafce52791f2b4abe59c1bcc716eaf0b3098d5f713a527ffb33e39675e976f96 -size 825966 +oid sha256:95a38bd143f67c3ec40b03892dbb3242d30fa175b50332d50d9de7cc1568452b +size 815112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 5fb8f7ac65..2d3f03edf7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fbe104d0f5e5db983d836054f35cd13ae589d82bc1037f0affb874bd7dab3648 -size 754906 +oid sha256:6cc2e734d1d5f0c6b63e4d3a93d586e6513379d655dd9ce4a7b04cea6ab76731 +size 725208 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 7e2e44840b..fbcd263309 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b90e116f2122aca36640ab90f96702da4462bc2736ba0665e57fe5d39864c34f -size 730794 +oid sha256:babbba068907efa8428ff38fe3159052428d26d6d54653f3f01df17d6d97476e +size 714810 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index bafee37e73..61c907e738 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:16a8725bcaf79441df397b8d8b68150c9eff705b3f1672c19cf6f3054faef8a8 -size 719058 +oid sha256:677b6d0385ac37c3636c917038b7e8dfda00fe9900cad17b10513d8595edf1c1 +size 705936 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2453ae7ed4..a59ace7f74 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc5ec9515f97246cfa819c8612db5b0f61a11ef92664d88b376677dc83b16f9b -size 610517 +oid sha256:973a9dc63edf16ad3c955346c63b34231b87190845e04b7c10e96bdd4206fbb6 +size 601685 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index ab882126f5..81e91ef312 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22ff4060835564fc0a59b887d6c7fc7f09acb22e280d39304cd6ce2226374002 -size 805738 +oid sha256:a91db3291b9aaa60f229b636d737b358fff09dc7fc40bb82ca2b1ef506cb38e7 +size 791430 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 00c1859507..fb98625d1b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:590f5ee4d4d193f83cdc6a53206a928c1f0eeef0d5f8ccac351996d91ed6cfb2 -size 691178 +oid sha256:c0ab2daa096f198bea8ab024ebca83a1ca923e8d3b2a70231072f3b5b5399370 +size 681064 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 64138e2f53..d69b96f66f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ff69794880a0f1144844bc8d521c524e2ae134655371fbd3db37900b3d5c13f -size 720982 +oid sha256:c21f1596d900e557ba1fec8f8ac790516472dcd7fe6c8431d8512c0c4e286877 +size 709932 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 7b4a50682b..f1582e515a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e01beb1a45556a3c584ff9fcc2d5a8b73a131104e7a3ca4221e7bb486dc9d95 -size 619298 +oid sha256:7306dacac68f24bad097fc3adfa11db5c5432d2edb9e618d7a15c923d4f10c17 +size 610713 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 84c25c408b..6e6cb62344 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94b49a1cbd0517790d651a87cf311e55a39f85379dee8753ffae750369dbbd83 -size 658228 +oid sha256:a596527973a5199ea55d95a1178cf76f2c167c334cf34ac3ddf571b1fd877f39 +size 658870 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a33473f89b..c205daa3b5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00f733e8db3ce48fb833a45491c678f5c754398fe3122b64d46b84a52d431017 -size 565571 +oid sha256:3cd694d5bc77aa5b53592279fe7a344a2a875c80bf4103446112adcf32705099 +size 560243 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index a1891367f1..ead49d4296 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e866754a3c408508951f101c0e4b41ea5a990902455bb83a435f1603422b038f -size 873782 +oid sha256:5640e2cbeee76b31c6f0ee68947120b4aa6530eda0d01375d115bbd8b3f15894 +size 843936 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..160e63b446 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f9cf1178b397a686ae8d53ff82c08960e4abe3fc926106a064551584373f85c +size 854160 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 0f27531e5d..d77e047ffb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5c2cdad481385c8266024300e99a3828ea7482c825d1c8968e467e75d26ae62 
-size 778512 +oid sha256:d31a3a0d2d56ca1f8e9117b3f6ad3658a9e5a240f1192219e6a4982290aea1a2 +size 753352 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0bd96bde19 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7053f94610510fd5ec94ef1afb83fdc1f0abbfa06d93b99eccc354bd0d054e09 +size 770778 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0a037832d9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0db0d0fd6928704f18cfe186d660748e3c9c51f2c0a3dbcda1362ac40deaab1 +size 620416 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8ff90fc048 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4ff5df477e9d0bd0b5abda32146104f04fdb0b930fa413cb5082d1570eed12f +size 517399 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9f7fc23a11 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4050d2169d974a22a148c98b332a95b0d7cad0d819c1001ac50b14a2bafae84 +size 693084 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9514707ca1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57da729b5ca9c2e740c6a1dd9f846fae99590ac401ce056df5cc7168d9ff4630 +size 582025 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b9a6607134 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed25237c5438361635ce69d138fafe1834405de8dc2065f1c39d1743bdfb1c6 +size 734870 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ea405a8aa1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6eddcea7e506f72d5d90447ab30a7ecbc17e94b691cea7205716f4bd28c6ab47 +size 634074 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1be974bac2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:cdca70480ed6e650c47bd43eef098fd067ed1d59945b5cc770c96c62904110b3 +size 571573 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4e851c90a6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59adf3d054ab14d914ec62501652b0d0a19dbb96d5a40e2ea27b132330116928 +size 480051 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index c8dc091b87..565cbb7e9c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ecfb062f3173b9ee9b2d9de67c28ee7b6b2fc54ee487f1685d7d4373e521324d -size 868708 +oid sha256:191b4b50b17477f9d465ab47046a2b9b664f2427e95c6b89135bed902d676979 +size 861012 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 3a2f62a948..0a187f9c2a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8afdd1eb466e922bc53e1716566d1533372bec75d1e29c7d11ddb8ff9afb65a4 -size 908200 +oid sha256:b3d97431248e9034aee6fa72139e78bdc9f2c10e4bb56734c760b7ec9a6a59f2 +size 898580 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 39b18ae90e..df69b72348 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a791d2b253f9bb3d57b2688cce15acd8618719fb2d39f3397f1a8f3de7eb2bab -size 834810 +oid sha256:d4841bf30198bc2ee669d249622fbee509baaf2a54f3eab64ea6a15b6639ab9e +size 810440 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index c2c3e7d9a7..cf34b39495 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:14b1a4ff679f5533da00c99747130d663059258031653f688912935c6964044b -size 873512 +oid sha256:6ee936b10d1840a2f7f15d02343eaddb8e4853b2bbdeee984ac666d5ca928688 +size 848746 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index b2f316fafe..d874f28583 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6785f8bec42a6a4d110f66c16511c87526f488137bfe5b1f802a706ec5fc00ab -size 811068 +oid sha256:9604c28259e4c3fab50963a26331652a10d4c7e38c187a0a29bac8813ada5d55 +size 794640 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index e28515e9eb..5f26d6cf15 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:daaf62108190b129458be2a677d8e5acdceeaedfd6e9ba34f5a4efd249a3d20c -size 721864 +oid sha256:393db31908f0fd9c0257ea097974369a5a01a4f43217c46e2bf8b5bf92a83f2a +size 705732 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 0b9d15be29..3d0bd19c3b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac719101e6277459fe356b5fbc639648cf54e60938991edce9f40ff966a68379 -size 853024 +oid sha256:c216c79f4a932fba5bb4cd0f8b44a503290a297270703c3039e5228f1a21ef8a +size 833094 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 3348fe893a..6a51f8a84f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7254b1b507a1fefc6d72cc03951baddeb05245777b5c5a3f35817521c122ccf3 -size 761504 +oid sha256:fb725e8291b6b5bce9c5e7bcf80a923fab1141275b98785c15da7ca8f1281706 +size 743052 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 3969f5b9bf..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1784583d9f22ca737eacd69e82bfdcc91c86aa16382937aa736eca54d6ee5803 -size 598579 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index b6533aa0f1..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:725eca0a4c0be1060a784a222ea689be1c6f4c0322a78a2cc294dc4d2dffa994 -size 558765 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 838153ffb0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5809af60b8c4508d4d2c42949f453e45c7974fa3121a78a463b1d33d38fcd99b -size 585301 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 7bba3466f6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:58932c38a8501a22357c2ec29bf13fdd04c3d2f7bdb899514dc98c4d43e823d2 -size 550075 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 592e87f30d..d14cf6fe40 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ae1455f120f347e608b644c583c1b35fd215aaba738c0cd0deb517d98df7aa1f -size 846870 +oid sha256:db69cd2d4fcb7f805a19b13af6982682707dd00a957f951880a8885a65941452 +size 818798 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 13b34de9b4..65d76f5d98 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cd629ce2cf8e20712da97c91bf0e62cad075a117f895a603e59d5f1e0460dfe4 -size 757322 +oid sha256:3f7ac626aef9a09268f098a0c2afcd7a0ea2a87e79d538cc7303896cba2bb88a +size 725798 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 03433ca380..b12ec0a3b2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4fa0236db82315f7dbeac5dd0033a5b91e98c78fed338ffecfbee527ae46cad6 -size 871954 +oid sha256:775b0b31254b52feed945012abb733fbbad093dd3b1931c97a2e117bee001309 +size 842158 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 7dfef821b6..97563dcbd1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4b62301c9302407c35aa7247c7d7a464b9eca85eb4d019b0e00bd6b69eb6bf2d -size 779842 +oid sha256:8aec18c10ef44fc17e6c74246195cd72add91a05924676dc29fd2a429a598f6e +size 753842 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index 54b07be0f6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d92ee16e232949ae212facd1456065987aeadca6dbba6bb4220e7595dae7493a -size 603887 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index b562a07481..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9b2f7bf4cfb5028b0cdc15d765936b09876438012ccb6034eee805f17680cdab -size 503435 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index 71bd6e0aa9..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:ff15ba338aebcf63f10b718e98df516f6f7b9f80c782ec94143719bc5b22c0fd -size 572311 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index bd4a244956..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1982c8f4f67fc089820e1e7b8467ae43904bf1f9d0fbfcfc5fa568caf9d82de4 -size 474869 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index c81c449083..0ab619c517 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2465ca8dac6b3b4e8fa3067d527adcc53e5320fd08895a1089438810cb232719 -size 924192 +oid sha256:92ef14dd5237830ed97fce64244ae0cd55850d9780897d97af5effb6595c6be9 +size 915804 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7c31ca942b..09670125a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:11ff9c1a3238554cf8f1bbbdf9b5bbe47cc1c3d393a18052756c5334b1457f9c -size 734898 +oid sha256:fe1b0b3e4a2e168104e9e86a2605b60de3d46d25c4c26cb1ed06338139fd6c25 +size 726412 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 5a495bd972..7197a3731a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:df45df67748e76de85c1b34d7274c64539341afcee0f3e7d861052f8813c89f4 -size 809588 +oid 
sha256:1548490fb18c08de3321a1b8ca1549927d783f807d131b2da29c6b8bf6b1dd77 +size 799524 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 91fcff9b98..2a6f09425e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e1661d5a937f176ce69fdd7bcdf1639949d67b9ccb6b33e0577fde27a6e3408f -size 798982 +oid sha256:fc035753e431c54af08cf7dda2d6264042b6043906ca5964abadf5837cb6c83e +size 803274 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 65aef15533..690c2cea23 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:dbe0734792a36319d9ca0716daa67ad0dcfeffe0ed722e43638bd0cfc46b1fab -size 686844 +oid sha256:7ca6b55209d89579fcf2833c81ac0a31d1c3e56f71b78b3112f3073690b41139 +size 677570 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8c440b434f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8510d2a6b5ef9390d0d0df4f5e56c7a5151555a3d0ab802aee8e1966ca659dae +size 953322 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index eaa65a09ac..4c70d750bd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:febdee9c104e6c7a211bdb1982fb8c112b16ec199fef8c2dc5ac00639e3a7837 -size 624070 
+oid sha256:236d4b569799623313b6ffac697c0688f2faf5bbec48146b998695df0994e9aa +size 627770 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..43c414f694 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6db1c728bbd413a1ab535e951ba7b7673f0eb0d4b3cc7e748e8b2447873cb510 +size 690422 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..67ed2c7f17 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8460260dc7014667816bb5d3d3f2200bf2262f1368c9deabf51616f2178719f5 +size 826582 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9856b1840b..61b9ea7666 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b24f4356e5591d52f881769567532a364cb3db894a33c8ddafef5efd9bc2fcc1 -size 584303 +oid sha256:dddd8161f524f6e46685f391a188e2598b5833266d6f18d929ffac2764db0d8b +size 585339 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index fa786a2db1..74fd205063 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:da58896d1dcbb19d22269eb7410b99943878198dbd54161e58fbee64032462b0 -size 889602 +oid sha256:9c171cb7078be36c9a514d28ea67d8fc5e184c89957c301e265cd786946ff36d +size 862322 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 95e2b4d062..20e18c3228 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a5aeec31f71c2ff191740fe48368ecfa4161a77619594f2e43dbb8ffbad9d5f -size 723792 +oid sha256:4e06fa39eb4abfb98537dd709433d9366f62d8e1a0737ddbbb28e2fec6b4b546 +size 711852 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index d1769b2382..cc0ec59b86 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6946c2aaacf0530736b563392696ba33cfc86f7f587684ba30a671fd743222ef -size 787036 +oid 
sha256:c5b7d4cd9941b3f59fad19eed26cbee268d822a3fd716f22d6e0d351c351971e +size 774260 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 5e83544987..a3000cc3ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d7fa71b29ac2f12e99699623807b6ba3dc1475058414e3795d75abd23840e036 -size 754624 +oid sha256:e7c5fac69abefc45f72568676278f27a83bbc34b5309e1e3b1f5712e68cb1f7f +size 743032 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f0f32c307d..5fd1326d05 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:48d9d52223acf94e463c829724bb65987b69efac4f450506cbb000ca97897878 -size 680130 +oid sha256:6807a8849a40f870287a473ac043ae986ddcd524b3f7f94449e55c66de57af2e +size 671694 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0d05306334 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2a797ff473b17f492548acc80ca426ccdc7ca15aab3f6bd2a7b391a61705919 +size 899790 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1627582210..77fb62e180 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f07790c387f47a73295f842d1a79df69e31fff4c662dddc915dc216ad3e318f -size 610003 +oid 
sha256:286b91629f1ef0bf8f2eb815e7541f0c1cac417e79fed2110af69819b395fe72 +size 611631 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..cb58c5723f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d67be3d025df626371b13b23df634d5dc47170a91ed9e0f26d174a4e4cb1e825 +size 665108 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7118fd2655 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e5829898a61d48362a571f5099672f6a8b929f0434fa18686acc18657ad0a47 +size 766390 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 85092b3901..6269bf7cee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:49d01578e23ffe5d1331edbe0461da10a077834ee7fa4a1f2a3efdfb4bf27b8b -size 575517 +oid sha256:06cc415045f2f77a4b8c609eec04806ce476c9457b739b741845b232589d2e48 +size 576651 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index e71d375221..e24fc73dd8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f5f694f7f7c4d698aab72197cb9fcacb1b448fe0e0f7396c426da6e59a4ebfa2 -size 920706 +oid sha256:83ad9b1504a4a1acc7379a527aec80d5f06ca8d8472df23596c3f40f10ccddbe +size 890268 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index bd41d07f46..5a7c7206c0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:63716d5550c2731bdd335a1a7cbcababf7c7acaf9c3b6fa4635ecde24eb9fb51 -size 875874 +oid sha256:8f16da9c8875eb83c3727bac984c9058996e88463faa0b5b804efaed90814312 +size 860038 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index de97cb75de..b880bb1470 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6d11b71ba5e1fade7a7c318379c99ad03d408711385cb1c550da0316cb1f4ca -size 824744 +oid sha256:54d6797f63296a857dbf3a9d78fdc54fc7bfdbe20751539735f59d662f3b577e +size 796624 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 43e3fe66ce..4f11989e47 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0205dbd3df831d1011674def428fb815afae4aef569aa349603e7309c2f2ec7 -size 778186 +oid sha256:ac608df25dfadb84b88d69bfbc39875794bbb1c80f38636164b90c555ddca6a5 +size 760574 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 0e0db885f8..cd6e36c64c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc71b743c302521684054abfe5c0551f95659d335f3ee6cdb585185c7761529f -size 747556 +oid sha256:daf63f35e22ffbe8ff4d9ee54ccb5d6384471fbb5bc5cbbc539b57dec4714005 +size 733990 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d3e0c63993..e86da71fc3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb0cb2ae3b05191e95213f7681268d5b04c59227f9e9c9576ae002c2498e3a5e -size 636450 +oid sha256:280bea7620b6367a0557c08a8bfcf323d930b0fe436022205ec1f0320f71d88c +size 629198 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 29287382dc..37bc9aa502 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:19d6c899dfd455b0cb8a2981841f9426e2e4f74785a0be3a11b42fb55de0a505 -size 839070 +oid sha256:40d6e97a36f5e416061034935baf31ec339c5600f6914bea17cce1ef0ab33fb4 +size 823530 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 7473080383..41f552481d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:75a3a18690c5fb3e1f45907c6a29aaf1154e43eeaa1b853dd5c1ef14dea3c583 -size 723818 +oid sha256:647951a2a7a6776e19d25cd8e452bc40920d9f0d106174623de5ac8a16d4995d +size 713310 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 4859f1a96f..3999261a77 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8279950fe32e24f8aedb09df2141e1f4a97f80795e39e7ce8058ac318e6be2eb -size 757818 +oid sha256:08f6d0fd760b15f1b0596383f64845f1568d448380b6af65c3af3e253498f803 +size 747162 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 9e94e59c5b..c5b1e29ab2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d6f5906882250274251c5e3f61e60cf1feeb130b57282a35da4937c73d7d37aa -size 658748 +oid sha256:0a476f0d74f238614664f0689bc790c74b51103ab326bb10104f0994659cf4e0 +size 647500 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0d9de4ac4a..75aca996f1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb029029a3f0c80a5a2a3a53ca4c6ce40436e774cc79f0aca7712e15bc3f1dd1 -size 694176 +oid sha256:54118d739f56a91b0b0e3f242cf0beeab0e15f409bb26ae139899850b9714683 +size 684112 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a978e79d02..7cf54a00a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7299ec606625e9ef424ff33058bffcb8743e2baeb6bcdbd611e80de810340e0b -size 589975 +oid sha256:e791e659ddc2572a2b539ae8a71beb937526e70684c1cbe725155c760badb590 +size 583907 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 60fbb44b07..382b1a2324 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2351e351d1015470f6a932fcba64dfb1415917cf9e4637780b23c9c69703e754 -size 945988 +oid sha256:ba01c0fba9c92146811cfb2533a3e218ea898559c4302a67b62c6034e4f441a0 +size 913528 diff 
--git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6eb268f126 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0661fb2c71eac41234768ba90c067df57484f7e47878d189a55835e150dec081 +size 898296 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 799f516329..ef8f0c819f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:635dde2d67daf05242578fea62b4f02804aade374eb4de9bad9949d9a3f9b355 -size 849484 +oid sha256:489aaa36db9686a96e1e053e566bb2aa718d953e0760f970bfb66fcb9f5f28bf +size 825114 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c9e90daff9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a01592c655d54913d9a8df064db3f1f56d8a76b51c712c4d36956235e5675178 +size 796908 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 5e50bcc6d3..40e2dd0e6c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:07195bc65eb79902ce38ae4fe914dc9d60e856106bb71beaa43785273d1ab972 -size 629426 +oid sha256:e9f9f50a937f62c3c7f677686361c71d1071ba494b8b2cdabcf5d7814bf67d6d +size 649900 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index a3f0c22d52..f712f9d85f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:294b4b24e42c3a1c34aba8c9e18ff0957cd3bc821ab20427519c13ce9a35dd8a -size 526361 +oid sha256:9f30a3f498a085c3ee46d3a34efd45a7de163636c669b4a4ad2ae91d1244b0f4 +size 545995 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4ba3734243 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99e209bece24f13e319e918b80b667052b15ee59871272357b8f3456b302aa45 +size 728094 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..eb7ccf05e3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d3f5bcac763e3b770620d2aed049d280e48a202944e5de7deefbf88714ed749 +size 616789 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d6ec99a54f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b252dfa0eed1ea7d3a1039026ae4a38918331e83906e744b8966980c60198ba0 +size 770520 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..21b3861ff8 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12811b91cd8a101850527ca8541ac230a3bfb5eb384b683c848a450ad230535e +size 670858 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 3c950700b4..5c9c4cea6e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1812dad30f5b40440d977c3528fdaf6d9d5a65b0e3f469d3550a81642fd9b55a -size 595039 +oid sha256:7ff75d52aec070dcf840ddd7b40deadd74aab9509e7aeadafa0497f62e3ca3ff +size 605843 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6af369e9af..12176a7983 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:102c642b73760c0118f586716afc4a431b3c6591da144b3c6c42688300948902 -size 496215 +oid sha256:9189f433b18a6df501ae96b0fe975a6d323bb6d977b2196eb7d1365205c860cc +size 506379 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 1521ef6cd6..466c3b571e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:55c3450097542ac9320b4e28d5ce69c91cdfe96d906cb19f737c4218f625834d -size 688904 +oid sha256:7e31f776be06aede960640ed427abad7000021e71a15ed0020e61b463cd3c14f +size 675238 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 43f6b1417b..b41b722e67 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 
@@ version https://git-lfs.github.com/spec/v1 -oid sha256:63d2f1d472b462303b4a7a536e4c41048c87682a00b264309d52136e4d720b75 -size 587465 +oid sha256:fa2d0f8d5848018ab7b16fc17f6069729253ef0547f4a531845afc1854fa4b90 +size 580509 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 3f6c5d224d..ea8671e3d2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e1255ddce06aebae213d92ebd980b1454bcf9806a8a03ff3e5a531e1e10adbf -size 710486 +oid sha256:405c05b63b129d86e9dfd1a3295c5f9874946732ca6c88bb90f8211d4686d1b1 +size 696722 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 222ed3d428..aabfc03109 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:d84a75aec314cb9dcf738f8ac0cc9862782905dcb0109c6329c4ad81d4c843ae -size 607125 +oid sha256:3c988c79050714718c34b78b332e87f4f69d88beb0692d2db2af4fb79c183aad +size 601549 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 910e5d003f..8ae460182c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9e4c41b53720b70550f5fe852f1cc60f5e37012b15f1f31e719d27f177f1a475 -size 684808 +oid sha256:c506c4978189a5a3f37e63b3489ddd9678d73ab77c7a3468995432c21f65dfa0 +size 675928 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index dbf9a3abaa..77a983641f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fe61a9f6cc8f966218f067d7ce468a7f286d87f9600a6071188aa5aa5767b7f6 -size 604039 +oid sha256:e994d15c202dea5560312f46ab637e0dc9cf9920dc71da4315dd66ed8dc1b52a +size 595653 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 41b98ba5d7..d6992ee979 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72c349b3ecdf85c159f08f6dac5f00d1d82dd0d83e6e68e880f2c0379e73067d -size 704072 +oid sha256:7384eea3999913857ac55b0980e62d02b884ffb632090be2876530de745539d2 +size 694994 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 339c0d69b1..178a8c570c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f04dbc0271bf099adf010f441b33ef6f89b0f4f7d0cdd6e5b3103bbe38362f1e -size 623156 +oid sha256:02deb1c065a3f4e65e5cbdb74b6541632722fb7600b84b0d46d371cd3f5747ab +size 614473 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index ede7baaa8f..35013330e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b02ae961c09194d7dbd594c7b84ae5bf505545100922c806dd93619c4e4d2ba -size 754450 +oid sha256:13a63d124a79648c86f67db3d1028b2edfec2370ea21eb76b0e1d4a7b0e4133a +size 745028 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 7d3ed8d272..841f4574a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:15c29fad74db8246d60548d5f86b7882e13d83d6bac5c60d32e6621c8ac2bd52 -size 658292 +oid sha256:511f2321b4dcd5f802d0bab93b3bcdf26dc312c7bedde356a9ec9278de8b635d +size 650398 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 8a2127e2b9..d2aff7bd64 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd236b467ce3b979842b817fd4336ec45bda78392479d4926066e91aedbb7ed9 -size 776132 +oid sha256:7a2c1ccc840a7392796fee8360b5f913a5b472e8b35a774e86b500f58382fff2 +size 767202 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 4e7996b38e..809dfb446e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d3a55050809a099e776bc5188ca2ae8f0f97eb47575154da93de09350df022b6 -size 677852 +oid 
sha256:0638fdd1fa461561b57ca1b1ab79ff6a220c29662d7f6355fad17a5bce7b18fa +size 670650 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 8b7c741f45..b3e1401e44 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b17151df4d768b3c7925a2853a64e1ebeddf40b12d305d831a2304e41136ba46 -size 845522 +oid sha256:8569c6817ce520dcfe3897df58f479c9d1046fd14e0d6933c8ba80515e016eff +size 836692 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c6fcad3028..31d11b1aa3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1feee53aceba95d45240ba8c781f269ca8d55bcedd1863b4ac6810a677c624e1 -size 848480 +oid 
sha256:327b1190c720b407264618c95d58ef761af2e0f63bf5005d76db22452223304b +size 837972 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 838a928698..f9c73e039b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:465571935dddf399a671126031158abaeb8adfd5296457bbf8319f665bdad9b7 -size 932446 +oid sha256:bf98cc2168dc1f29d1a79a5c6338aba25c58643474e9ef71a16b56876936e693 +size 919126 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index e2aa57a28e..92be3aa172 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:531bc7a24aa8a903a370cf8d39a13bdc75b7457f0a6ee1d42fd0b9208b91cd07 -size 775170 +oid 
sha256:8f866fc027971efd039500eb540296cb292dcd7952e7a779392e286bd4fcb1a6 +size 766192 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c39c96c9c5..64acfa8d60 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5f3874b1e86464a727dfeb34512ddaba5e71e471e2bb358b16b3252b304fe5f -size 805608 +oid sha256:4ba088ce71988dad8e6dcaf6f635091fc0d2c055b4ba76ba778733ab1077adc8 +size 794852 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6f817c49e4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26b1cfee7ce907db0a635577d6d9ee8ed01c3914286bf534e974267d4fe0ee47 +size 880228 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5d87362bc8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a021897701644fd73e049b3bbf71aac1450b0bccb9918c58567ca3db4d16a522 +size 663454 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..97f7f906f6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:080f2777d9e9dfb5342ddf9438cfb16848f8a7fea659cdf628c7200479f6d690 +size 728920 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..10eeec8e70 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a95a8d6e8f81d513607e85b04235ff1136d47945cddce6c42dbb7a7024340e4d +size 787626 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7489a1729a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:555de69f83ada98adc541377023cba8a496f2d2a66c6271339cafb95b6abf622 +size 627588 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 978ee0db83..bc27a3aeb3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b193c6104856ed17781284c55c0c047f6f1fdd2209ed7cbf83fc860b67e69d81 -size 764412 +oid 
sha256:8f32889ac043769915b46ae532abb4d05e75112846fc23ae5eba154bedb3b3af +size 748822 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1af7b35ec7..d01b58126a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d0be720ee02388b94e13e471af46cfee334393867bab314ba25065b924b133f -size 830912 +oid sha256:1c3c15da3fb45bfc16e13589d232a58767e345fc982ed5f8f09f5a7d855ac52b +size 819318 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 89a4aefc5d..afad69430e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b7038a4e48cd4bf0a5a9a14b28ddb3349d8e13ee10a2212a9b280900c13922e8 -size 900916 +oid sha256:9447a58f95d8585c583cf9502e0ebe344c1b75d64f6e27216dc8e1f1ad5068a1 
+size 886510 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 3b15a921e7..7364ef5664 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2fb2a8a0da2be2ebd76b55175f37bf88d3ba74735328362885d63cdaeaaa27a -size 708712 +oid sha256:a15f924c6719679d5dd3ba599a36b0d0d1a50e831131aea85edb7726af4cfe53 +size 693074 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 62ad79f3ef..00bcf4efba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b1a2583d9ef3446ba0766d035fe95c543bc8a70a489d72c6a2ede73521a1d64 -size 795340 +oid sha256:4b7b936f7a0bebc48b28117e21ff31090cff082fbcdfa27e7367212aa0450a7e +size 781674 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0dc6c26c79 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3900fe27b8714a90ae79c05acd0464519dda21536f006b94ea9783cbd245df91 +size 793148 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ec83b5eb1e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd5bb5cbc060a7c6716404b2792b1cebe0f5df3b746b340d9332057587b711b7 +size 644160 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6027469e29 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa25b779eb670c9537d329155c6fac81a6b4c4b32e9a89bb2efb2b68e0236175 +size 698722 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a127f39208 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f07cceef7b22479f909797f1c20f477cbb988d9c037bff064f81878c15e346d +size 715298 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c95c43e8c0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4437f80ee4bb36e78da812f590b0bfd7c1cc8cfe035f2e4f287bb4c7831273e +size 613767 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 1ae73b317c..8c3661b2df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2979c6666a940efb3a09825308ff221d66d128d1d6d8521379f12fe6c296bf11 -size 796552 +oid sha256:76698e70e993286ee32357e80804d135942c81e65981a018e43839dedfe93804 +size 785204 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 776be75906..6657d8081c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3b0666d49a68f8e33f0bba39a7d94ba797e235f6a0a02c210f5a28620459a07f -size 762720 +oid sha256:44e5afc4442ada9b8de02bf64800c5fda96e94e4e2fed30eff333d175436fa31 +size 753988 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 17e10d4e4b..8408f516d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ab2651031541793dc88702adc09a4b6a88b2e07a32f4e20adbea4921c53f8bf -size 695952 +oid sha256:a1ee3c870b820716980ebd587bfb7e67cdb52aeb00f558505490d9a08a7d7cb5 +size 688404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index fdd7554619..10a30ef6d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d58bcf681453337d9e0145d7d2e826dd0a09c4f71f8538b043f6b20d0bde2de -size 663010 +oid sha256:0f958bf5d6f1115d81556e86d49ddb03d5f262698ef6a1994b3aae207bb9b2ac +size 655412 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 00d230fbd6..9df83bce02 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:14011d052ae63c8064136931c28873882fd5a314c6ae4b446ac903f44ca5ea27 -size 836374 +oid sha256:8db8cd5b2fae3778ec74e7e6ae5e7ed7be5e986bec7ebfbdb0e48e042f84056c +size 824040 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b6ea091661..d3a0b5b57d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:58f550c9b3ec3d86e1ed9a496fe9b886e61c5c085f4422f3973b774cf9770d75 -size 726154 +oid sha256:2ae77dafc73df0ed8aa9e06e1f44489d5302b6a34273d46a5f794d7ff141b4b7 +size 717374 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index af6893a4a4..593e036529 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:da8bd957c7bc4fab3a7b8f0a3c5798c1cb703698c18c76fc028bdd79ec0eb979 -size 922608 +oid sha256:5a061911cfb67b74cda152ebca34d1d1d3faf7875f53c1f68deafd7ceed407a1 +size 915800 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 20cd88bb5c..b6f6cbdfc3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eacbed2f3a2b299949a3bf7df44bc336426f117c36f1288b61e7f5c6b0861c81 -size 820776 +oid sha256:6e0d1c9c8fb325d132ac5e2a793303aace2892cc0c8ccf9dbee7e4200bb005df +size 811254 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 65d7ca6f28..1c64b57159 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a90f8ba5df8a5461d7d1d798ef1fa8f225c86ec949363c16e858ae951fcc67aa -size 710574 +oid sha256:5d3193ba6eaee088cbbbbad7e1f3f486dcb2801bd8f6148c71c0a3ae38e8bf3c +size 699522 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 0e75074352..41169e0109 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:beaf83b4d4bff5ee47ccca09b6c14b23cf32ed29b6e35fff204b27aec4ebeab6 -size 606373 +oid sha256:ab9793a10a0e7c2ba58beba4b78176e88e2d6b79baf0aae23471f07b35cbafda +size 599761 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 9e32c83bb1..f90751144f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a231210add4f3d51b069d8e3b783ca3c7ab3f1de3ea3d8176ae112413100993c -size 768884 +oid sha256:d0bc61038528ba96d6fccb2e9718433806b8db3784a2d5d6884750f55b10f29a +size 773668 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2c0cd30838..8c3c73da0a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:da5d42682adf22e496460f472dff380f7d7780657a39e9a456f75342bcaf7cc3 -size 678398 +oid sha256:0a3dc6e1c62e5afc970c769bbb376550b1a7fcd0e5bb5f414b57ea914f1deb45 +size 672280 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index aa59ef5e66..7bda2ebe72 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36b4aceedb9261bc0fd1088416536b0cffeb94892c57ec1138bf504abac0c2a4 -size 816506 +oid sha256:eb1268c5d4dad9fad1ecbd0a5ab4a5634833dad64eb81865e36f8854663873c6 +size 806048 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..365fdc1206 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c9e402b99bfa1fea0e84f5bd078dc7feb5746749d886ae89ad5fc15afda5112 +size 799202 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 7b4508b252..cce36d107a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:729d092b76ad3c5452c5899d9cc6ada904bc7a4d9f6cdac0039bc4558c8f41c1 -size 719952 +oid sha256:7efc4111f444f2787ed3f1857e2c6b3b188f08648d64c0dc712503a10e954022 +size 712208 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8053a6bfe1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:737d458089c8ba14b44357d5637014ba092b7f7df9b1ae34395d2828243ffb86 +size 699936 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d204d479d9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:1430e1878c148c855b2c91289f8cbcdf8e89cb208dfd4f6c138da5b02bbd47b6 +size 678284 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8758630ebf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06af334985a89d77d22e80de7757265b45790fedc4783f7db1eac7188cb98f63 +size 576599 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6696c96c2c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4e574a1ca89c0f54e3c7f5ab402c6ab90ce2ee56075297278d20e5f6e0afa9f +size 751692 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..94b6f66c54 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb80c445d8dcf9736fbe3977b41afebd6793060cd373052202fc89a7bd166a55 +size 649022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..efc6e055f5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbe463d2e13cd477a789ce297cadc392d8549b5665a1b3613a505a76a9acb391 +size 724854 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2e39663fbb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46651152f09b3dfee60646cf7b800c096abc63a404222b46b27896c83d5adb99 +size 621986 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6f130f4dd5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:465251c8f6267a646b466edd31b3d65434bc535e7f1ff8d690f957be0d537251 +size 627124 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..fab259fdbc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:378b87b1737704c560f1e086e2799d68cde4bdf9e44ce333ad152439842a7dd2 +size 538067 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 3ba3579c1c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:351d60624dd7f8c41d61d7fe69dcac29e5d5c94dcbe7a7f987260e49b40b453f -size 663452 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 65b75eddc9..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:401ca6aacfa2c9a2592764b58a0031e38130e17162505b77b05d0bf66b982bf0 -size 628670 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 201d0d79ba..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ce4ed366dc4af90f9c550956e5c77564dd5261401869e161ea10061fe51352e6 -size 645934 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted 
file mode 100644 index 4ec5e2aade..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a4efe673fa5982c3026b8991adfa9014a9a4b8af101362cc11189a691cac65a8 -size 616775 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ae27f62e0c..b46dc6d872 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:336ba319b72eece7bfe1ebc09de0586e54dbac0982910cec40b68510cc473ff0 -size 796598 +oid sha256:af426a95e33687f31cabeaf6ce8c44ad7416c4bee37760e1ccbd5bba956c6ab6 +size 785054 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 7fe6f8299f..3283ddac5a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:15443b1491b7ca31755bc7bc9e10a2867fb276c79737f2d45c0539644762a0a4 -size 714106 +oid sha256:d2c181c2b81aac735429d665e075816222f4e6123d3f4861a76cc11ca9640168 +size 703450 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index df8f48dd75..e255548502 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59db77c013478ab8319988561d179b66414044bd632b6722d5f3e2e3bb0bedab -size 819562 +oid sha256:e230a170832419f496a62ff683f192743dc31c80cee65bc5e35a8cfebcf23903 +size 806194 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 1a71cdc108..397f380978 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:ac354e305a4609fcb3232ccc7679274d5476fa6c126be678a4b81f87913c1e52 -size 733222 +oid sha256:317c1f9c6081f4d1aa4780aebfdcd1d0c83d31b102abf4b5dcc6e888d959f940 +size 722318 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index dd9355c170..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec6b8644e90631acdd2ad66801c231bd8bbc88634b7fc12b79893b9db1eed013 -size 664172 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index c616a2793c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1a8dda6282050ef85dc51f57d7474c4030acad8afb8ee2f01a62724d118d49f5 -size 565595 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index d7fc30dec8..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e1788a6f87d88d9af2f289901bf224a8b12451a346e223c4777bb3c99cea4829 -size 627862 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index cddee083c8..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d447e72393534ada7fea1cf3b0ce1cefa90319f91cb9b1fff0df80d7df08e826 -size 533429 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index abf2e54b55..5b44366a31 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed9b37cec9c6e26796bf5878798551d40b0feecee13efa63cd0beb28a4ecdfaf -size 897354 +oid sha256:5d3e79807db193da793eaf9bfdcd233e460a09d6d56a791cf71b66c683c4ae10 +size 886352 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4281e26c39..4540c7d7d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70d94539a25a11cda3c64d2f495150876341e649a0d1e0e4bdbda8f4c08aeb04 -size 876336 +oid sha256:f3bfcf90a232bab310113cab0185c38202f18aeeb9b719005192186d65f596fb +size 868196 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index e36f5acb3a..3eeec6e5c7 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:806480760d6102222af206f4525a51cc940bb28a51e5ea9b34903132c20715a6 -size 963606 +oid sha256:69e6bcaf91bbfd1433988f7a70bfa5f02b27ea24ed70b124c60d70fb0d62b19e +size 951964 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 9a300b3b8d..6a1cfb6e79 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f190ef5d5036cc25f888dc9ebe22bc0b5289975b4a43b38d9686b90e42c1a63 -size 813238 +oid sha256:bee787deeef0bfe60cbb61934b74345e6f7c2ad900d65d1f5844dc9ab303dc89 +size 801990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5c9207c286..9a4a055245 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:363ed93c5c91d083701cb68d1c922f9361ce125dabe4d98e6a45530735b7dcc9 -size 831046 +oid sha256:a3debcce06d3983620b3c127e2226f6154ed11fb12885f2d6a9899aee49cb522 +size 822906 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4c598555ec --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59e5d8e5b8b24d5c1ccf3f293205c0f95350a247d6d50c0a2a639ab914e01ba1 +size 911636 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index c7c0984222..3686cf8753 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c18a26b9121d9d09150a10f9024c8645794e057c2fed20aaaa9ec169a737fcea -size 689732 +oid sha256:b06c31e65056b9ea76c6c48aded3cba9499c02f3e473dec85bd0b15a042fe0ff +size 692100 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f49680601b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c3b9a633b23b21516a142ca26d3059b0bddfc8fda2fd56856e31eb07a14b181 +size 761512 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7f95307474 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:560acd8aa8d7cb6a586ac86fc11140c618438420db708e8151f7136f06afbceb +size 823426 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index c6fc3b8427..3202eef34b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:540a7a144f9a1085b0f75e76a49daf2ec67829096f14d2a252102d362e2154cf -size 653322 +oid sha256:4a4a2baf1ff53a6712593dd569788bc5ef634619da10db1fc4eaf1e1ba4884ac +size 654062 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 637b24a09b..6175610f22 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b48f39de0766d14701b01f5576383add132ce8ac479daf5c586142e114bf2bb -size 814122 +oid sha256:2f93dfbc1b90ee9573a017830e27d8f5cf407ac90fdef78432a60dbe10924c4b +size 798484 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b612bcefeb..5adfaf2b40 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b7182f1184d9ce3d0d820f37d284b9103fb0c7e3cb78ef56bf4ab927de970992 -size 859556 +oid sha256:8d1c2bf1cfaf9105cc0d00a7eb9d517f5a2d3a4a6bc895e55231949124332e22 +size 848752 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 004f97f62b..fa58f57f22 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32c40d88528a35ac2a68e0bb16bd609a162a890efffcc7058ad3d3378630fac0 -size 931880 +oid sha256:1e5bc276b3b7e522be360ab712c93c295101c154439de01ee88ad7d5eac2965e +size 918362 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index cd97b37bdd..e474172289 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a23a22afd43c46adcda10489a67be1e6243d31639117d7aabfe208b83f1562c7 -size 744610 +oid 
sha256:8d30b12a7a481ce432a95c8c93941a0d53bd7daba9e1121d6f91e54d0b6aba9c +size 728872 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b127928a00..86ed4b501f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c651307f1f31836852c156f07fa18aecc007b737b401fe5fc3740ac90857619 -size 820778 +oid sha256:2bfa6fc6b03328ad47b4fdf4d9de12ecce41c10e822b4a302bf565cd81c842ce +size 810518 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..14183a26bb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b55e257dc5bce5b62d2eb4e124fa0d6ef548a8b35715a5caa2567742705299c +size 823816 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index ca443aad38..4fb6e5bcfe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9668f0d476aa4a31b49499f98eb84d16fad43d3497e3aecfd1e4b69e2db66923 -size 672164 +oid sha256:741f4535e3e91d9456ce108e40740ad5039107419b02259581965bf270a55da7 +size 672016 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2f7d34f8b5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d36bd7ea1137476c84b2cae8b2e7b43b5e6d0d80ad07274dd8e803557af1b1e8 +size 731314 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..242127c4b6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b5b0bafbda771bb8591c34833fd67fbb4019ab4d3248cd626752ac03c83b772 +size 750308 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7a4073061a..b50353d716 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb11313d94364e2c15c2eaa71d7a286449348a67fa9d1933fa820b189b19c4fc -size 642216 +oid sha256:e2cc15bc0b249b8a723fce6e56db7858dc2ada371ce0772b6bfb9ebb7131bedc +size 640242 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index afec15547e..761e799298 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:360bae716502b9b6b0b0e8b3b51bd340fae5e54b485d9ab8b5e2c7aa9ab1d171 -size 870436 +oid sha256:2951197fe1882053bf7c85cde96b277bc467f8cc64fed9e57f0a312414fb8f30 +size 856326 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 61ab681801..3c66e065ca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4edd82c83863a1f1d94a3bec322885d06ecf8c73e0676c5ba66c124d914aaf51 -size 810804 +oid sha256:d961ba4db8ce302345ee0ec85a4b942259235baadd6d71dbe72430121289b159 +size 801084 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 73ed1bfcf6..c4b7ed3acf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8eeed4baddc4039032eb1b2b05ee126b6050837c59bcde40baf79d4e047bdb5f -size 766728 +oid sha256:80f45acdbde542f5556be9d7111bb9ee51f9a5436877c5fb01c4e9461818f80d +size 759576 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index aed068e2e2..b2d5bac8b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a46fb97af177a447a6e44767e21814e987fc9c019a73d7eeb474eaa5ba339d9d -size 713116 +oid sha256:6545559526fcac473a136c539e658ab57e7694bace787af05bf4f88696066ca9 +size 704728 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 47b0226a48..766f7889ea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36788425128cad0c7df1c311078e0d62460a0ce92d1b86c14ceb89ff2e4b9802 -size 864772 +oid sha256:8234a87930ba0187ea1597119babdd8082d1dea0c4aaa6b7b69d782958be9b90 +size 851994 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6389d8fb20..8b2122bb53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:589c57bc9be05d396b2f9a7d61828e4c0155f0e6ac78ce53348ff54d2ec8da97 -size 752086 +oid sha256:7c628495c226bfb4a9b7a809141b00bd1f88fff3fe82335f92c1c2c47d22de9e +size 743946 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index ebbb65e549..71672df72e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:112496e6ae5be1208ff78555fb3f053d155a070f4ea22a9f3da6acbb2034092b -size 957272 +oid sha256:259e11a16b2c7334a783c2785ce45c2c4c3257bc8e1503cde951a5e400d50eaa +size 944198 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 84466bda2b..d9ad930b09 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:10f7914d5dfa1d5fba5c7c99b82193d1904dbb62dbbb59811538d01f17f43239 -size 853812 +oid sha256:960ac78286fe957014b70baeda0e03184b0ac31333484e6d10aa0a488e068864 +size 842662 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 18cb8089ef..f064304623 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4dfdb00ab145d7b4f0296a23e067c487fcb4c8189603a55ee068ee360af5c8d -size 744646 +oid sha256:02afcafc0d9268ebdfd9efeee14e239c64afc7a8c348f80868dc03be580c6dd9 +size 733792 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 898725ca0b..8b2119d3c0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6f71d663dcd9146facc784eabaa8460d658c8be7029a183146ae03f3061a12c -size 642962 +oid sha256:d8e1dcdf45de28c70962ad30eb8c3600edfd5739181656689339a19e142c78ce +size 635118 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index b8e009d0e8..81d850df36 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9dd03dbb40d1c72ba7476d24f396c99ffcc0ef89c18a5a188b20717b893b2c71 -size 809270 +oid sha256:bfe14fb27897bdf26aac3786c7963c0fcb0ddacfc086ca06fde979faef2c872f +size 798812 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0b2c5fe24e..b96372fc02 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:faef3949a97ebd6fa1f342255b8620ec336e19ecacfe6fbd727fdf7a75687a2b -size 701912 +oid sha256:c6d6001b89eed636be65228381c6c1fca7a50b64f76288164b665c81c0e40941 +size 695944 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 904f64ec2d..310b41c4df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f60d7508d1b10f19d17ffffa8f8b27078462b7b0d40781a057290ab24c62912 -size 890242 +oid sha256:a09095b479fe2d081a0634265b07b3309c884ca201cc5a99e2c6dfe9350bbf19 +size 877316 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..85efc5eca0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3abbd63c8366f561c7a9c44e831139c10cd0458ff66f9b4d4ae8a4558ac07ac +size 826664 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 396ccb55f5..4eef80a301 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2a9c71663b0a382af504aaff676c1c80afb25053f927b2870a0c6bee7f067b13 -size 787276 +oid sha256:1b6b4e8bc2f04da81d47389eec8ef6e70f895aff0feadd1643e447daac3a014f +size 779924 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..17f5a6e50e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44fcfebca43948f1a7437015261326227d37d17e9c7ad4bdde26611328f4e220 +size 730356 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 78174ce71c..a1a8336160 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b192308bc3d7358cdcd3b35dfec5f5fa6b745db0ea68752703b0d2ccc7cabbeb -size 686998 +oid sha256:4eaf8ce42c4a32c2ffc31fd79b43b1dcca83322fad6cbcafbe0902ae8af390e0 +size 707768 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index e57e3752b8..bc24ebc04a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc46fb1a0a8001feedea383acd572b22b24604943c059bddd230aab9498458c2 -size 585561 +oid sha256:3c95e0131a38e4b12cf1fea0eb41abdfc08f15904bc849b5e3e827b0356f2f12 +size 605293 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e81dc462f1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2544154d0c0cfd9dff45f4a036b738628f9fa3ef24683192b6889f88491b295 +size 785714 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..22a8f62aef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d575638d857f2c98431a90c6323c1dc145747c5c6b5d5036f354808da465135 +size 682402 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..30a6ca318e 
--- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7edbc1d6f36687730597f7712383fb4a8b62ace0f21fff3bfa52298382897f5 +size 757496 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a8f1507874 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f188a4fa5cbfc4e202202b5c75b442f20fe4651267508c039819ca5b9a7eff6 +size 657342 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 6f774eaf11..aa3f8b567b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 
-oid sha256:913a63af6fc1d0134fc9781efba404e7bae6257d46b15afa3110ec7f1e1ca7d4 -size 650934 +oid sha256:469722906ff6fb46d318e2f647c383746b4abf9ce302119158a373405c05da7b +size 661886 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index ea1d575bc0..4fbb1c5805 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3bbacc97142ab5b10aadcdd80b67c726cf3635a3c6cd59d4caa9808e68e369f2 -size 553393 +oid sha256:49cb0e4828c7216cec7d63a20f5ddbe4b34a08ffd7b9c34e8df4baf1db95bbe0 +size 563457 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 359093c278..1b9a8e830d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:ebe9edd911fcaf278f56b2cf5b28cf4764751e68dca66dd971fbdbdd9aa7a5c5 -size 679776 +oid sha256:3150617654b28b1c032d1dd1ee38bac94b49ce0a7fd3b6f29d675b5a7a64a759 +size 670500 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 9c0e8490e0..fa7640fe15 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:76f4522bd6a5f57ce509226ed226e6005153955ce7fdcb153769696c859f97ad -size 596985 +oid sha256:e6a1a2c8c8505b36a91d1f696be4a5b6a7b079f706401ce4373be987dd3fdbb9 +size 590079 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index b1e690b104..defbdbc403 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1c84c80ff29d478d28a97537444eb12daded6a1da0deff1ee39e5d5dd071130c -size 716602 +oid 
sha256:ad7dbbd2cc700a4763750f831318989bf95cf4f96f5814363f5710dcbf52c91e +size 706044 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 9d37b7f7a7..61d84161b8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5998368e27fe0f457d239ce1662d8c2a6af4e5f8bbcdc90c0f97de95923a7318 -size 632136 +oid sha256:c5515ea3369cbd3245552cf4dfefd5f4506dc8a9bec4e217e38123d1a3abd9a5 +size 623946 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index e65d4ce110..07160438d1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5876acc6ca6cd7bdcc6496b1a4e6756d2da2ece7d9bd997094340deb3a9c8656 -size 681254 +oid sha256:76e48e9abe5d46b637da09be8cbcd3f0510c020595d2c556ae287f16e75b2a80 +size 666650 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 5b7cde7565..049305173e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:16f076fffccb990b8b77e34d849f994a29f1fc004dd20020f4439485e92bed1f -size 595897 +oid sha256:5182cf19600569d65f42a615c2b9dee846c73e298377ce5b01264df87b3a9647 +size 588991 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 2edb9a2350..e01492e3e7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:034b63e916b36db9c877d455d6b15c31f3a7c768c82ddf3b75836ba430818949 -size 714774 +oid sha256:8d7d84edf0201c4f10b036c696cc50ace9921a1991c90f101c205fd79bf44ff5 +size 705254 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 27cc887b80..da663d336a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c594f433099fc24a9d016a9b56f16196b88140381ee365c41ee24c30bb223a7 -size 630110 +oid sha256:36c6324f2f11bc26343d9163bb4a6b94991548a0ab4c753241a151abc315d215 +size 623154 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 1954200183..c15cbbae04 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a1812496b30866ba4b5db366426a580530f268c93033a496755d51be0ae76b6 -size 748528 +oid sha256:c4ef727e0bed5bc081efc52a6618c6e5196e23ca22ddf88671b984d1a33e39db +size 739452 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 8a32049c14..395ea7d2c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b26914abb7169b88c36af7cd544b664b2f21b9212756ce54a440b3ad808a73d6 -size 667416 +oid sha256:50ac2e5178872b2b162859d9f005af21d5cb9aa976801f69892e109de90c6858 +size 661448 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 065b466aa9..6aaacac8f1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f19632c85885f22b6337306236982aea04da3e1c6e467a37c8ac9c510a87e0cd -size 784764 +oid sha256:bed65b8f9d54a3571b3edeeb39c0127b06ac731a9867d351fab721f507eb2efd +size 774996 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 9b2363e29e..91c6e134cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c5349b6ea4d5941eec1760238594f984624d0b8d788cd5e7903e72e5fe8151e7 -size 703454 +oid sha256:3b8e8d2397a10291434a74683efce1234ed94faef7c8e1b53a789e86071d7e6e +size 696252 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 2a35e32c5a..6a459a7f9f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5fcae8d890223ef2ce45dbe04f786554153f065e124d43a6a78170d894c98db1 -size 794706 +oid sha256:3cd61d0aa5998ad30554f2716dd44aa0e518ccb4d2d38db41572489eccf2e626 +size 779858 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index fe69f84a4d..c473ac7d62 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a887589914ae4f76c068b877c1d6c23f41edd1100f9540d7f5d82c1869d10ce7 -size 660666 +oid sha256:913264e3e5cd6ddac27f1e1aa4e2a8781309b303a67f4bc8b21deaeddbab2dfd +size 650010 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 34c4385699..de2014f1c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0ac73aae56d3b4ee613bf2abc94a17eebc97b6676545e6505c3b7cc366fc5738 -size 725934 +oid sha256:8ba51b7a6caeadd5a2d7ff2e410e5c68492240521ecfab7366b35bdc558f40b6 +size 712910 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 1f9f7c6cc4..4ead33eaac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2771f64211fb315ca223fc365ceba632bac8ca2959f435742c71a2349278c9b4 -size 711726 +oid sha256:8dbbf31cf5bb78f8cea68776a86e5449b44ce1433184ccd8dbb2068e3e70637a +size 712416 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b6ef0e7883..bab37cff2e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d3aa0abc879aa1cdebe401a8831543f1d0d4143a4e397347bd9548c72fc37e8c -size 635010 +oid sha256:b3a35bb1091919c9bece8105cddbab02a95ff3689da4eae9ec92ad97c8ad971e +size 624946 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a1913b974a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27e42c7fe289e15a9fc8e1439d35db121dcd2fef6dee40941ade34b05ba63253 +size 834938 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c9c52716a9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19c1bfcd6e10874676bac2cb2d8c792fd0c2687e10c40e51ef20b930a4a958c6 +size 571889 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9f357958fe --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:339aae8ae47aa0b886104afa63b4e94d85410520d930e14866e60e61cd502a06 +size 628278 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6a0b49ed94 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82a31e7d131709d26c6fd946ce654ee7ef950ffff66ed3931fdceeceb7147231 +size 735578 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..526cde148d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4954f67013c50a06bd297edeedb4ba08df03bd428d1b917b7b8602f34664104 +size 548305 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 8a56e9bca9..8ede1a7cc1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:03ccf7b4d71ff8299672914b2b50a735c3d852c69ca763872d126450854ceebe -size 738510 +oid sha256:bdb40ee1bd46f9e283a8ad071d7b4473f110e166634be8e586ebda7bf0fba099 +size 726818 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4f685162e5..5943b20e9b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:298b3353da4594726bb7a64bba53fab36d1f612e2b161a17814d220c26cef55a -size 651682 +oid sha256:1ba83690126181d290560cf9005bb84cabc4e8e29087e3fac5fa88b76c8e123f +size 641766 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 034fcad566..4132cfcc39 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:780f6eb5df1f4a510e7ce7626712822c966e684b14988b52b0563b54e199da69 -size 712410 +oid sha256:e9a027de3dc5068e88e9572638a7d86753dee4cf709563e26234cbb5fa6307f0 +size 698350 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index b2508a5293..24e40fbab4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ac9f1080a78e8298da3262b5f1edffc651592f7618b77e65c1249b06508f73b -size 671168 +oid sha256:58e4002d7670c927505d3e88d11e1e86ce818529a2dc5adac4f7ff07b1c666dd +size 659328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f45ad72aa1..1be57f1dec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:18f8cabd6d6b1c1369e7b761251dbff15074394181f5a7b44aa88c800b625213 -size 628296 +oid sha256:5807583e55a60717cdce364d80f10a604ea7fdcb9ef58ed753fcc63dc3150c1d +size 616701 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..57abeadede --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f7d66f6f9368f92f53440941c719efbf17f8cde3fdce4676d4c3665824a13d0 +size 782688 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b4c2465fce --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:704628ef654a26c21d4971bb4ac9482f562a6eca081346b0a55fd38ac30d1b12 +size 562411 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a359abe01a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f3f53c0babc407804bbd1e52a2eed35b5180e7dfef6cfa189a9c23ddcd9d068 +size 611349 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..32492478ce --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a32577058c5d5115d1d9b8b1ac679885d5b0ef715e60fac2d7a414590a27d207 +size 682490 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..83dcf984c0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c5a691c4b277f9d5d452808dfdae75e60a5e8ccaa42b265554f97a0c18e8f71 +size 539617 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6087f87d80..a6d613f104 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce085204716bd9ed56b1d705dafb1ca77cd527c70c68dfe69674246dad1c0a99 -size 769120 +oid sha256:68fc8cb1c5342056a7d7c23ad47e231eecdaf465f323a36e691aba6342a638ad +size 749880 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 
d46d49a712..aff2e9ca55 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:34b5e26ec3b1c36f0b99c01f0b4809cd95393a0030173a28cd63ea8df1ba4df6 -size 744120 +oid sha256:a19da6f0e2878438fffd22edb52e108c2f012c32994e6bee0972d3787f4a3589 +size 731786 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 77e14f9040..9e479be8f2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:42b16b1f6bba6abedc962b98eb0992e67df932e3f2bb38516a7fe02412696934 -size 676810 +oid sha256:5c930b9c30e4246021e49be06aabb2fdd365241cdae4ee543693abc93cc8a6a3 +size 669014 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 7cecd76305..a849ee3e72 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:377656ca5fd6f2f88567cda6f41944b85176953c00f96269cc0b147b374c1149 -size 649244 +oid sha256:6497709a7ae4aeb792be34e6f5be33415807568df0b665d7d04990087257d5fe +size 640660 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 13c9348b76..273bcb058e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fe329d6f59cdd94ae36adb143541d65c4502bbc2d3147f6da225558b96fa8bbd -size 678110 +oid sha256:241d40505b2259eb408be14e1bc49a8066422403a437aa9f87dfb5a4fd52009b +size 665678 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 65e8c0966a..f2dc7e3039 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e76c4d293fd9745c330e1ceec524690157704ef7fd9656ba2d418b50bacc0e78 -size 582543 +oid sha256:902b5286e242e391f73544f238904ef8fe1a54ce7effb11fcc9bdd6e24c8635a +size 573613 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 7761bced1d..68118c63ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5acd4d96ef0ae6e163dc88b44f7f31f24982ce899e910298bff213dc907c8307 -size 752948 +oid sha256:c814f40da4bd332617ecd48d8842f597b9cdcaa716300a5eb8855abad0a56b6b +size 737902 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index d314f25a1f..8b50f5c9c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp 
+++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:84f329910648e00b2acc93cacf942901d8404d48e8d81f0d24e94658fd358241 -size 658320 +oid sha256:f418aa0ed4fe113123b5659aa7e4d4d2319212c806493a8eb4314e2bc7080781 +size 640856 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index fc7a8346e0..c3e9cd8078 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e5ad75c2d6322510ecd0bb327eaa41a3d887e763df94c116f3d7895124f1f18 -size 669724 +oid sha256:76a47ca9cbcbbfae134706eec01ae2b75c805dbad3dbb99437d2f7248e09c2da +size 659364 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index f7b36acb8e..441a7eceef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e7f06bff3576051da5374596b8f326aef0195f39a2a4ab314df18c299aeb12e -size 582739 +oid sha256:88e919f26d7d1e5d80b140ae50c7311f723ad94d2ba1a8f5fa3ce0e3c3668b8b +size 565967 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 57947f5211..1f09e07501 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd3bda6347e9cd69505c0bf60225d2a61e419797ce42c810b810f83f02593bd1 -size 633560 +oid sha256:730e6fa37b346238ecd0deb73e2f66f4d9d2c25f3fc487565dd584b710b945cc +size 632820 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9751d074ad..479ea6d5ae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 
+1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:179bfe1e0fb0e82082488a0d78e8e3c1e726dacfa54431b30a05002a832dec80 -size 547365 +oid sha256:d1e311489c6441fa97d293304c97da22074abd0d2dc3c4487f7b860e68825231 +size 541347 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index d0622f4641..1de01a1096 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f517088379540b0de25692421d33259e310fd24d8ac0f53727107ba37f9c7ea8 -size 799830 +oid sha256:af0ac9006ed5a32fc75646bcd60c4eaf4559d7b35efede6ae5c49f49d80c8b06 +size 784684 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..407007329f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6279de8a9275465e927fa0038bc8d3f9e5e5896e645191471c64e2f2363d46a1 +size 771524 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 4fac110548..409f270f0d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:768ebfa7f2ac5de3c9bebf7686ade0c11982a34e43b90957ff39458ae30b3072 -size 711120 +oid sha256:ef26000d24a1f4728aa88ae5b045efa8700e912cf7ed750340ccd5d0a05fe1ab +size 703670 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5b0924dd2a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19cb83173e674aa7111107ed8e945d6d2646bcc9a07fedbe05d66cf6396202e0 +size 697418 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8cd5474bfe --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ce25affbeccdc0e0a6eaf19e66a362c92f3918c6a529500394e62bb06be9cd9 +size 591011 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..13f67986b1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa82820d8f8539d0186581289fef33cb0922df5d166b37b438804f84f84022e0 +size 498847 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5b9da60781 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:dbc7c945fc31120dcef87e9f1783a05a20d831ef70082908c1e6deea2cd52480 +size 663088 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c379eedd50 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62dfa62c9430ef682dcc5ba63ca62d5783a3f1cad5a1232dfb3f816809d46b44 +size 559675 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d3d3eb118c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:301c94e5b69d359de5b68e59c28396c4734fe69e42856c0111f8711371c4ece9 +size 682722 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..dccf47bd33 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1f5f05d4fc75fbde6eee7f7de17be892384471c7e17285c9b498af0db05b9d9 +size 598451 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..09b923391b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8401ec78add4b75e9671356d151fe93a43f98af409596789fbff0acc2bde1d71 +size 561705 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4157678f60 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:512cc6a04ec577c791eaf1053d7c590e8d5de34027d12136028a78df440384c7 +size 469393 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 7f92f56456..f1d65d3871 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8971b87a9ee4c0c57d09728d191091d58accfef79e31d56958af9ba07d850fb0 -size 790662 +oid sha256:c8ad0cb35f455799d3baf5dc953a8a2c9640ea17020e283203c3269ea31f5bfe +size 777390 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 2896eaeb11..ee198b9c12 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:44a4bbdead9b35717bc0fe909a316a5766f25a24038f9a87fdf8047dc3ed8e3e -size 828770 +oid sha256:bc8cb459149b0e7210473894636f1bd4ce7224a15bfc05e33f631ef353c52f77 +size 814908 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index d999dcbe47..6859f44dea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:580d87ed0b0c47f523b76e0bf51b796ec2ca8d061d08a7add690fbcdf4e72303 -size 733280 +oid sha256:ad4da540d05ca13a5d7766799b76c122957946124aa7bfb2a0683c1d8bf07699 +size 723660 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 5c2e6ae401..15d5965cfe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bde81f59dd045a18cdccf0745e957f77462b2c208305b2be54cf2e5dab5fbf97 -size 771932 +oid sha256:7cf0037d0f9ecd96a0081dd69e8c2437a8ea3d5cb801af790d981fc07675b71d +size 762412 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index ed0a95b899..978fd26360 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a85548c4023447931db1b7e305ea44f8174de8dfc51f57876a5c0bd6ea74a81e -size 722956 +oid sha256:edd08ccb1742642e70094e6de13babc00ac77e793f9fdab87bef7825201e2bc4 +size 714964 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index dd1470709e..883ed75397 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8320f5700c857c6ec8db7ecca42bee4c61a0dcfbba1166294a41c91524a0bbfd -size 636270 +oid sha256:1031d8fcf05bc69a43bbe674f5c7256628fd8ca1f188b68e59d61480e87a3978 +size 628178 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 0f0be4c7da..068f672685 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a975609012c95bd5e36801ca76bc62bc2fd4ccd42c705406ff37fbaa12b6f684 -size 764222 +oid sha256:227e11a2d8e26660acab059170fbc1d696524bf561cf2c73e58e143d808f5f99 +size 754060 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 9455f1d278..9b756f2273 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0ece9b1d6eb03d77c56c5efc47891f92d13b3a477da15f529fab90a0964e0ab2 -size 675070 +oid sha256:256ce2907da6aa8b0a6914788747cfd11da26ab363a3c12ff60b21e606231f1d +size 665400 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 34fe70aa90..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:66fa78e764e664a336387a5de2e6b06d2586c57ce99f7331973dc98d3e522ce8 -size 569913 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 8332d2b2ba..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ce3c5e8477ec7c5fcac0b04870a9626d88b9564d594a30b8e2b34af5b85f50e8 -size 548205 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 03c9fee981..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:8170352f6545c416f50703be6b7cc48ad486123a1d926b438c5a73896a61be33 -size 560781 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 72d68272a8..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:857611452d5b9bea5632871586b76448657d3104766331a8fed9414b3907604d -size 540355 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 655da91a55..8598283066 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2a36e338962a6acae3f5f8ad45c2f06c672258a71607ce5a2fd4016fbe94ad73 -size 760732 +oid sha256:3f2512ddb9d5dd27eca82d4f506ef379f0c38b854dee88b9a9ed27d5e54d16f1 +size 747412 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index ce479a037b..956b96d0af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f7b3cd0ff810e65df7494175f184dc8e6b0c752a93ee2b61b3544dbbb0d0c878 -size 676116 +oid sha256:56b020af95508b7a56eb320b1161d53d704dc4e3dd45b98c4e0c15ec69d59949 +size 668618 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 434a7c9cdd..2f1d869b90 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0432c5f72edb982ac69264b44d7aa713d452968544ff341f395942eefb031be -size 797114 +oid sha256:ef8f1f4096cd2ff01563df856c87b134d47d9b7d9ce67116c1835567aadf93f8 +size 783004 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 81d039759d..f99dd7492d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0bd7ebcd386a7c2c823a813e779c6f8b0dfe2c223f09e3ac040e25df7f4a2440 -size 711414 +oid sha256:db8eada4bc384c5a6ca3fe413e9a2aaa624d0e3332c5b97f24f1113d38e85c50 +size 702288 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index d6696453be..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0cb98501cd2d2d6db8b5f075b7bbdde1337d4852482afba188c8447b46ba9169 -size 487301 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index 644e35aa2f..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7acf6b060a072bd4383b31be79c63032801bdab369a55418042dc61d9513f5c7 -size 560963 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index c4454318bb..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5aea462214367de6d1cda5dd851f2941dc6e80da608b22836213254dd9962d29 -size 463571 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 03463b0637..3b3cc4d142 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:91d366a9f1eb1e2a79668cbfc59a77e78ba3b5793119b427f05aa43c28778eac 
-size 846982 +oid sha256:9f56c49e8b6e8945ffa5eb9fbe0a880137e001947a07b40f196a4e2cb3da1375 +size 830900 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c35d82c12f..0087c6280f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:050ba29cc56e535ef95d5f43d3becd4f27cc1fd3fe1ed4f1c6c57621cb1b5c3f -size 689558 +oid sha256:e8730bf6350d14d4cd77ba3f738deaeb9bed7d7684fe45fd955bebeadb4eb268 +size 680234 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 60a9a13118..c48e0a9699 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:7bd90b7c22ebf22d99995d1f69d0be0103bd31a3835eb462a58fab3673ec277f -size 759808 +oid sha256:d572419c17d7d0a667063a8faeab5f996e97971ac866a80a266316fd9b15ad66 +size 749104 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index e8f845cb46..a06e6b13ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9cd775591bd7ab9e5c0fda332292e02768a7b03bc85d65529217bcc7e55e46ba -size 747772 +oid sha256:943d88d35593017310cb758fe20072f32a6bc4df2b45e83ec8b7184a9cd0f6f3 +size 752656 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 28a7489b23..659ff111ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
@@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d74893c3148a6bd18ddcda16699745e3019970d0a9f593bb412d1f21dfa16a1 -size 661486 +oid sha256:4d352cc9bd28810cd6b39545bdf6204363a7d12249a329916ddc599952b4e7f9 +size 652162 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a3797bd59f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36fa412ec821156639e15fe68c0bc6b55353b807eece1ba9e8813c9497751f2c +size 868368 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index ee77b2d7ae..431f17f6e7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:8daebf52e2b3518b976e7ede38dd2b3e1279e0775bc91e020f6fec7dfe92502f -size 595403 +oid sha256:f4058eec0600ef74aef61c3186b165256d0ea4c2b202f4f7275fb1a146fb4f7b +size 600535 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ba10ed1aa3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11e87e196b9c11eaa0c6989b07b7e3f6f2fde2e0c9197444d018616df4e21b30 +size 660080 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9a96f722c3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c4a28197493f5c5ad15e1ab5faa83a9d767cd01149a14d63dcf82ea50636a08 +size 775816 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9803d90f6e..eadeb32d64 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:14b15e3b568ee4638ec40a65b902fc001814c254004e85c0db30b0c318decc23 -size 573745 +oid sha256:c8ae5a89c474375914b98fe67affd4a640e32c89376af698b3f0cae2b0fc31ed +size 574781 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index f2da0a0cfb..77a2fbcd08 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37b26740e40462d848635137b86502c077a4c69f0a7254f16147c67251daf008 -size 
788368 +oid sha256:58d8d6dec54848ccb1e2ef96a49b3fa640aed950407f219412526b22b43348ea +size 777564 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c78e6a9150..a05f9c0f06 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:88ee5bdd83ae62b0f9d0c409908dc4ca86d322c5a82273273a626fdfafc113e3 -size 680524 +oid sha256:9dc7f8c950f4cee6b8ae0910ee7e4a6fb3c08bc0c3f7432a2f83cdaad0a1d1b4 +size 671990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 9f65efb940..01a9de3a27 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:1947fc099bad953a4b79a8c3cf60ddedd680b412a225db4d7de60a4125478619 -size 747074 +oid sha256:6484d3e936e159bb53510ba527aa49163cbf668d07c4d6e820339c670d6d265a +size 733508 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index c792ca3b97..c5f17d4d01 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97f94e9d6f0d2f6414d632bff1358c833ff5d68b61a821c08a93edce6e4a98cf -size 711604 +oid sha256:324b23b5af7cd08c95d3966474a860117ad74dca3cb1b958d5b9c5ed71fb8d7a +size 699566 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4594295133..3d5222805f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:4bc9d9c897daec5e0411f64bb626e16fcbb4c7a3cb9730c324133ce70ec1da78 -size 654770 +oid sha256:4b4b17922f0c9ebde7bb395c977fa12ab2bcabcc820df597bcc2ed2a17cf9a46 +size 644706 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9028e7412a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d2930081f6936ad29669f6509809b259b2c070ef6296443619d54633cbb560c +size 815822 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 575ff6d403..bdacabb7d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61aa835051da2dd51eefcd70aed565de16843312a8e7d249382b1ac7ff96bf36 
-size 586271 +oid sha256:a2c3de662ac787d29b8abf152bb03bbe219983c2cd02da68a00984b7965195cc +size 591057 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d843b7b6f6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcfba92a7b68529a34e23e15f3b2403a5fccacecb01408366901926bc32c2fdd +size 643992 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c80b53f5fa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24d8db51ac18fc1b3744689a11d5a8ddf8909443c47933a7d72b530588a70557 +size 722778 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index f2857101da..acb41398e9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a9e9b3d4e08f60b58a84386c5c8d27dfe2c281717c45b6e2b3d81eccbcdcc7af -size 565007 +oid sha256:7f91a86278c5187b9101b1a1f6f9f8ae70bbc1ec27468202691b79d39054e168 +size 566091 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index c7c8441758..c484051ac3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98a24476a66b9f7366a947de91346b7f0aea8346f0c2a65ee2326416f38364bb -size 840340 +oid sha256:ea43da8b2ae1eb7d4417dd48141e9be57c23ea581edb94f43f554cbce3304ed6 +size 828402 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 0bc2f83b8b..553729090c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e73e5e090fd44e70f721837cdbcbba158c93e199574aca3bd544c2e29b6bbd98 -size 792894 +oid sha256:e9a7756e02279a2b9c786299fc2764bfe10857ba4f5caa00a61a3e579136916c +size 780116 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 77902d37ba..da03dcde35 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f9d7b871e0b0340d0d6d544b282b9a96502eab4769a02cd935f98c7f0e7274e2 -size 748474 +oid sha256:e766dfa0dc24f7cf55e9910cf3ae0c4545bcd9429a26fa54adf43d7ed7262022 +size 741666 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index dc933afcc2..c4e4bc8c21 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:674d68539df79239f70d412bda8082153cbb0b2000367d454008bb09080680ff -size 698806 +oid sha256:55cbafd4fe416666ffcf2b8834467d480871c9d67da445c951a9c134bac2f13d +size 691406 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index bc7ffb9332..4bd58541be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3e774b19cf755e6f6c5847bd744cc8b09f511a9348c10d25b599dd3fb355cf5e -size 706656 +oid sha256:18794fd6871976e27fae129a5493103f95e16b43e53731f7ad5c74c2b9969a9a +size 693732 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index cb0d1b5fe5..c611d65c1d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9849a9af75d7c866fcfd1f79c1469367ccff60dfdd613b5b68b6c0e5c196810f -size 609411 +oid sha256:11f90e22f7f8f930bbf2a34e66c35b3f848d66a1ce1fcecf84ee3d95c2c894c7 +size 600335 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 7dbb941134..002529cce7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c42ba5f7471e511914a0f7069323635f07c659f08370c79833d91e945bd1d15 -size 783764 +oid sha256:590d95155ecaad5f096a99657b3f73208b2ad957b7932ebe1fc2858cb36c429f +size 770100 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 1fbb7ac8d3..3281a86196 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0598b34d221745f1752a373499a07bfbf78c6cdee9e885645bb1c63e8471512 -size 689136 +oid sha256:9940bed29bdac055f0927e920542995838432b0c356fee5e56a81820da00f224 +size 671770 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 321dfb5843..566fdcc15e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d4a0155bf691ae02efd022ba2dc2249bbd9cfd4fe18630bf99ec467e77d7dfcd -size 708088 +oid sha256:ecc784c87a85e8440b3e634902a704b0168b6b8630325b69e94a4c6164e3e554 +size 694916 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index fcd2bb1d8a..7632b0b0ad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d7e8171b069dcca21f0ebc77f0255242561f4fd43f0cfbc480307b07b28ea517 -size 608771 +oid sha256:c2f0b6d1957d86f66224ba3d6990f7a39d96a67550794c28613fabdf43ae0444 +size 601617 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 30b62d6115..b3a1b7e07a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:137e5fd456cd57802fbe36171f942096e2c012fb239bba3e47c3d4b7dc8593dc -size 667780 +oid sha256:17e152816ee77d75393d6f68f4e22084f569596d867f60620cece586d955e7d3 +size 658012 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b7ca57a321..f933172332 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:64d899b109ceaf419af5c247baec1a15caeb6c5e29fea96880d3ba17dc29faf5 -size 571817 +oid sha256:7432e7925bedf65072deda79a4753bc936be3ab26aa2d03862aabb38de1e81c7 +size 565109 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 8355966b45..5b940aaa16 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db5e3bbc150c3970edf58defbfc372cce556109d7898aa8fd91ada43b671248e -size 871246 +oid sha256:1b4ce84b43d173909fdfd7a00751244d95b64a12136dbb75087f7ed756353bdd +size 858568 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d446de3d27 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf4d65ac2145ab6157488624b8bf7b7fdabfa98383fb48cd3d449a9b6af390eb +size 818274 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 8c5a3ec832..2353ad1602 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:79df818eb47542c73e89a916a07398424c2d244e5f43e1e5a1e87f2f4545895b -size 784560 +oid sha256:36d674d18e6c6780a29512187edb78df6792af1abc6f8aae36ded654ede343d4 +size 775680 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b4cc50509c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59c631819408ac41fb8605529d45d840dbf884f6c21c4ad8cd06b0af53e58301 +size 728530 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 619aeb52ee..63580cab85 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4fc15b3366cee0724741adcbaf392e15eea7865d47b3ba6fced2ec8c0e89c3ba -size 611911 +oid sha256:a5f87a7ec418f3beb38e2f9f1a7e67904821d886ae1cdba37653d7c13f27198e +size 629228 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index c809d68df8..67babd5898 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:135a4ad86a938d57393d7f15cf22d390dd8f335c8874c9af7520b7bfd51e3259 -size 510227 +oid sha256:8c36dc42eb246a420dbcc1f52823c9a74b45ff8f8f37e1c47dd3220ffa042e2c +size 528135 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2d4ca13003 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddec34ca0e75c0cffb2537941921ba68bc155fef577543866fb6cd2c2297eea7 +size 697308 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp 
new file mode 100644 index 0000000000..cfd2db0e7d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1db07e1e12c962a02acb36a426eeb7a7bb88f59da9861bcc72d4da176b97eb4 +size 593057 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9b3c340033 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9488bb00737cabf0c39f747e9faa95c0e7e0f59ea3389829e395aabbecb0e638 +size 719064 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..52174d175b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:168f32c88f8f3c9e96ca2055a4c2efcce8e6486119548c3079cf67c395112a4d +size 635238 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 3bce4ef2e4..f8fa5df023 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:648feec11fdf8cd832ccd47bb79a3dba983567980a658e7f6ad0aa54a600dbeb -size 584529 +oid sha256:efe7f513f327e17f904169339dd208bba151ef67cbab05b0169985db6d014a6c +size 595283 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9ead3367e5..866171fba0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c95477a6b48973ba7a9d9b82a82add7fa24e2077b1a598cf2cc17a837c060f40 -size 485705 +oid 
sha256:a63a64c6f536f3faffb6409507120aea7d8b68c66da9d4ccb3b8bef72241e526 +size 495721 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index ff3055ae10..54050874ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5edc98a63983a350fc8854d25497b1d1249c41c609a37733444a5fe804a63a61 -size 761684 +oid sha256:5990cdc426188f6bb39e00b155f4cd606e60208cf3ab01d5a5ea0f775d9600f6 +size 753890 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index b3bb43dc19..9a5b1b06a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:76a21f3514ab6245b453c28be239a1272880211b828676083ae97978002f119e -size 669720 +oid sha256:618ddebf539f89bfe685ea6831eb5cae8fb8b39024e1dcb74f7a64c00679cb9f +size 659656 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 236c955ca9..59db396dde 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:554e885ab62a9e9304bb30e4a54b43a8f32520b478fef8c78e6a238ba096ba25 -size 789780 +oid sha256:c6f0ccd5a2cec926344b3f10d8caad0dbb1e6a849a69084caa3aa15b7ca62c38 +size 780800 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index c2ffe9a8d8..091b26178c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5899c9fd5850bb0c1beb13d6ed03eb07d70167a316f91119bb1814628e979d6b -size 697962 +oid sha256:f595bc35cad2134eb2260e5c7aad1a92a9fe31b19a422e657823df0eefdf71c7 +size 687602 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 4feb94740c..53126e9f72 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:85ffba6fcc0f699c4da6d9c5161c9911e6de7c650cdb92abf8089f91deaa117f -size 768786 +oid sha256:f1f760001d55850dccdef026475ca03b1eeccfc36ce263f509944f4399305326 +size 750534 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index bd21050fd2..2ff65a910d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc1d42acb6a9391e85860e01810c2a259483e46b3593ff96842e6d0ff6105139 -size 671148 +oid sha256:44213a0f89091c3550b8cae8f4ea06649d55ccfc061058c8c222655b33fcada0 +size 664686 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 6aa0c8d1ce..bdccf0b80a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3b64603d79ea4270007075d5a5527d158051349427cf57456594b9252bb6f1ea -size 787854 +oid sha256:86550a844e89e3ba2f75bc9c2968d6c946a70467a69435a65020cd6f368f4cc8 +size 771820 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 8f513601d6..8c8c533ad6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:93e5bc9dd0dfd52b8798f1d86b9354d6ea875b77563658e625395bb00f3158a1 -size 702746 +oid sha256:3f645a70deda07906f6440e216a76b030717587aab6c7835dd304bc0f9f54c4f +size 692730 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index cde70d3ca6..aa1622f52b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f84dd4bffca8bfd692c66ea06c7ece9f0c9cc009f4fc5ca460f1a5da20cdd090 -size 866668 +oid sha256:98826657989e2765d3b9ececec30bbbfc823af0bfaa76d883b7abaaaf5e46d6e +size 855666 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 1e17ec502e..5a0564d177 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a4df8fc98cfddb501c84364e9adfaacb4ede0efdf767647b2990f6cded0dc56 -size 774308 +oid sha256:1d389dcccc7d1a7dc94fe333555336fc0622d0e7976b9e3d1f626af78d84dcbc +size 760692 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 8269434f0a..04cb0f085f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97a188c46ffba0f035bf3ed73f256c0920f442f62832063d243dfbddbf09f532 -size 895354 +oid sha256:7eb757a820ca8f97852b4de0bd180c66406c05ef625fde2479b6422b3d4cca7d +size 883564 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 39b78d88b7..648b28c477 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf62b874f95717a59af6c5523a2bc8fc81b8588e3898f5835eaaaac8c6cbe5d8 -size 802550 +oid sha256:e2275482bc975b3ff2313832052126349fee4bb56f7018ddad18a3e1bc312d94 +size 788688 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 1a8aa43b19..0a30cd1724 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:66fc3bd42185e2f73ab36819bcd1192aaf23d3fe081c8656edf9d923834f31be -size 865532 +oid sha256:52c41cf3ba0d8f7189b698c09f7e3e996c6fbcb9b622638d6503048076a6220a +size 854086 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 75852a7990..61a8ffcd9f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c0d7d26ab986023794886bf384ecc1949c1597d8b4d08d66f066abeb24faec12 -size 793644 +oid sha256:0869a82d7679c326fc1be41ca516850173020e6ee877815a573adeed09c230d4 +size 780916 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 51909499de..34d3f4f5bb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bfbf6f159fe18f2c97608217c831363efb2de27f0f89b7cbfe471e7bb357bf77 -size 894662 +oid sha256:0db51e764d66a6336e14682b38a2e0071fb255797d796b21846973a145d27690 +size 881194 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 70c48f66a3..54d1a7b654 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5be297a6b5fb6b93b460fdf095c79fddb365ede326a72fb1fbff0285b26ed3db -size 821936 +oid sha256:f7f8e509622bcda4118935880097ab4e056e593749f9e609b49d8b8cad4e7e69 +size 808172 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index bd190a6f11..5f26a2fd1a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5eafccf69d6d74ea9c7ed18f003375e26e9400944ead8354796d2ef44d466a70 -size 762478 +oid sha256:dae7e56b3f7d43cf012b21559545370321bc27d4e3db41f6ec2c566d455341d3 +size 753648 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 20ac3b7b39..d23fee7148 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7112ce4228bd30aa13ed2efe94c1881a4080cb47a0b3d1d14e012b0488470170 -size 670512 +oid sha256:71d47929eddf8ae0927d132e9019652e926063642cadc745e0e5a8c950b91d9f +size 661238 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 96fdf2d917..6d76261f26 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ea2605e066b75129388f9529978c468e970c24d45e2a4b6c5b44477b8bc9b9d -size 789636 +oid sha256:fb74c3c45309ebaa35e4c9238d0f501dfc6de6ffd7bea739b0f30f224f21a7b7 +size 779572 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index b56e64e69a..b12a4ffd93 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0fbd6aa4cd68cda22caffeff3f3eacd4fd8d9c880254c4d148f5a65429d3ac0a -size 699544 +oid sha256:d2ae3cfbd419dba03eb4aba2b4dd04e55fb122d94f29d9a95ba060576e272b6c +size 688396 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index c3ec2bd69f..266904e250 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ea196abac32f1770706a33c568f3e0877f3d1fdd862a36853e2e79136091a9f -size 769580 +oid sha256:b177ccd59871389e5422af8f8c2716c3a9b57a9c27fc06f4a9c1f761a7aac194 +size 751326 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 6a503c529a..b5401d713e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d31e0f0262c921aa2cf7d7937d6521d9d89980b96d3c46967a4d372d8c96bde -size 671942 +oid sha256:224c5af09d4472a71d9f09125335883d6524313e4d9ac66f83c8bb3207ed97ca +size 665478 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index dcec6fd667..776ed2edb7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2927270a808367b84409a377a51425d53ef45b7803011eae711570dc8d70708f -size 788646 +oid sha256:ff7016610347365e5a78a87efa17564de4648a8fb4c6e4df4a773d50a2136788 +size 772614 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 48f17fbfcb..366a08d159 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bde93be61c8b701a8b76c9c0f7e89db9a39efa5f271010c0eaaaeb246f47631a -size 703538 +oid sha256:34ea6417aebb6b74ed00836c7a3aaafaeab233b8b89e72d8bf374a348d2ceadf +size 693524 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d85da79993..cb622cda4f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0f183d79f6781357a85d5bb975cf1874ea76e0daf163822dca219e838e1a677 -size 956356 +oid sha256:0a01bbdb89254df0085f43f9c2646d2add0dfdc2894185b309df95b2cb0a7c09 +size 945306 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 81eb1f45e9..9f1a059bbb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2be4f49b1c2d03bc20a398347eae8ca74c519d2513e3022bae58f70bcb48d537 -size 893850 +oid 
sha256:a10deb6bcb819847543a9e25290bf49619d0b9ef4fe7b2811067473bca32ffb4 +size 881614 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4f29c9bf19..e3598b6ebc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8502e30ca0ae2eb76e7e41735010812acceb0e136acbaeccee679ccc430a987f -size 909044 +oid sha256:49c6f920792d8ffea29900517a00dd82105c62ae9d9a3e4011d78ce646e85c71 +size 899030 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index ae705fb887..6f91c6e9ae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:afc7b758493c2701159642595406aaad09155fda15c1ec8ebc2daa88d3be33e2 -size 850040 +oid sha256:f577bbff2b7dc899e816875f90107e1010270be18992a787d9ba54ccf7ba38b3 +size 839828 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2d52a4e037..901d7475b7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3d15e89bc1431b876c3b6c44d2eb72a89c3feb87336d5d3b19446df67eef598 -size 734924 +oid sha256:fec76ed70711accca4e4a129e28fefd194995a3f4a5d969b79533646efe3bbe7 +size 723428 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp index f69077c25a..970b31076c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba3d6c34402dd81aa1cd761b10654782eb4a041d65afc49be9497485b00bd54c -size 708676 +oid sha256:59f3331bd648d3d78d731531f72aed4b02ad67432e81870f9193353311ccb859 +size 697970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index e032d805d0..d2747767eb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fe1a0617ea02ca758f1cc52e5acc7119bcd3e47c085cf480668612f779dd57ea -size 695258 +oid sha256:a138a48366fd2357af48306d57abb33bcf6857bb807d216681bc8aadc2fb9d80 +size 683862 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp index f8ad8b3365..91560c733e 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:65fd7773db121a62a6ae19aab870212b174f4332b2664abd882940ab875b411b -size 670194 +oid sha256:a1c3072dfec58f7ae0c49547c25df01a9811d4c0c75bac928deae0fd9111b65f +size 661956 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index c0d95a9d8d..52d2461955 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:331bd6f71dd1f084c47f5e80d5f72dee18b0a3a233bbd9fdd5b845f4927c30b7 -size 750004 +oid sha256:8435571bb10c3a080400f264fec29673e147f039f75002279eaa9775ad0a72d9 +size 743984 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp index 9852d9bcef..d7160a08e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d7ee0a71a829476767cd64021c9c4f67f60d539ad186e13f36a352e6b1326d41 -size 759548 +oid sha256:6bdc31f33abb6249262508bd3cdb6ae681acecc5980ed8048bbb0b8140c797e7 +size 751408 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6425413e8a..37c8350227 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5618fec29b26336a5f17c91a0731e5beefb066bef107730617121ee468a19c3 -size 945448 +oid sha256:0e73b4b2aa38ebc152f41cc97db5be425ec44b493e8f3fe28160f0a1134f4d05 +size 931782 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index b776590c0e..a2c9d60d6a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:044e11cae6184d70db4dbc28e9d6ae98eee5a7accfb573152ce913e8b3e51028 -size 882990 +oid sha256:3580ed5311721abf9b1c608edb261d418603765f01528aa774fc6a29ac61faad +size 868042 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 502edd7f63..4f262babda 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09d4844e7aa2a93f7565b09643270a4730a0a41124c3498779a81d4c779e15f4 -size 903908 +oid 
sha256:c0bff83cbddd6e0d698979285099ecbcbd80b9ac4e7e0077cba6801325530b78 +size 892956 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 160edd17e3..f0075a859b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1bbd492a1db812cd00e0cc447f2839350c6643d58502caa7642e3d04ba3727fc -size 844114 +oid sha256:e360d1065f5fe7979c2bf35a415373e547c5d726788b9957d3ab30e38c66f079 +size 833802 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9a3242a520..353a9448fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:4bfc934f8ef3dd667d8179332bdb4d5e2586ca186ffd53f557c51f76389ea771 -size 721646 +oid sha256:93b3c9ee57f79266c177a66edfad94790988a6a9c985af8c857cf5e04281e106 +size 707438 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp index bcab5f9f75..9c580488d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4753cbe5f6f0de314bee645d38e8f8edce4d939827f725c68b8087946643a60 -size 695448 +oid sha256:30e0be62b0b7cfdbe8f55ec1d7f5d4476310a8088118ef0ee4afb597dac11bc4 +size 681980 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 210937be61..527794a1c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:035bae20161f292af84486bdae1b51fed28ab60b046bd650f030f8b29f0063d7 -size 688000 +oid sha256:2835a806c1fcc9f73afebe3f4b51939c4b3a4d379cd1a34c4e6e0adb65d1d0d1 +size 676060 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp index cb21667402..81ca693114 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a3f3b34c2a818d311118c0160b682c97fdbf4afe50f78facf9245bb07e4afa0f -size 663626 +oid sha256:d0bc433358a76963e3f56166166d40b7f8427f2f0b01e183925b267e31d1efec +size 653218 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 31a41bbbb0..f3bcf245e8 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:30946125c70e83f71b244f00f8ac48c0a097cd561cfda1c24b046b0c1694536d -size 953772 +oid sha256:eb61496be336fb48d13318c5f93dce34fba0c22d4b361dfc934bd498cc8fb3b5 +size 937640 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c0ff635ae4..3e95d23ceb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c00ec2adf488714ef5eca646ffd973ca73b6e5f121ccff4596a2460f83bb1afb -size 863336 +oid sha256:01ba7b268ef03e8baacd63117c8aa720fb57bde1387b345a1e6cdb124b01406c +size 851100 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index a633512d0a..a1b5d2a077 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:308b0acde95a263337b5839a155a9787aac5ef285a806866e29fcb83e3f72c84 -size 894322 +oid sha256:617cbd3907f663962c1018b7da60f442a4dde358a4b950bfc86671162d13006b +size 880262 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 4215d14d27..0cc18f019f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40ad66783d4c4bbcb86fa0b1e650ea11a23539ac99275ae43cca41f4ab2cb333 -size 800680 +oid sha256:9d0aff3e06053f692385152dbf202e3e544e716194b202411be1bc3c45eafb28 +size 787064 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 73660cf371..9eaec46388 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3160d995640b4e602cc840b82488b205757b8c63a7d05f866f01d414dccf372d -size 768228 +oid sha256:0037d5ee855c7c4e9bf459fa63794d94bce95e185de5dbc18901b27c87439f28 +size 760040 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index ac8a3b6b49..46272a1413 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac32bb27a3dc7f22f089727ab9211b063dc926111accb4624dfc5fc8c55bed04 -size 714348 +oid sha256:eec3d604f76635941ee7b73275018c8e5da65ed4a8037ab89bdb0ea945c0b1e9 +size 706456 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 4b5cd90175..c08da9d2d9 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c765f7645c3378a9462f28aac2d7f68d3acc414eb0cf8ed66650c74c306d74d9 -size 915388 +oid sha256:f141c825bf37a9cf44bfc9ee28b8fff506f2ac3dd4de4bc9d1a528df40a5c96b +size 901278 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 40d541e009..2b8ed900b1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:84ea959c1037a0672177703f3ebe37c8affe35b0b6ced86b20d442a6dbb571ab -size 825742 +oid sha256:ea22983b82ec643b62060704940ee601969d1eb756c35da643537737c037ae96 +size 816960 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index eb1b973ae1..f88f77f21d 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0af654bcbe175cfc5b60b3765d7cddce0bd6ddc118748439c1c98c49803c4ef9 -size 859442 +oid sha256:577f7049640a0b76668a7528be29bc5eb2553fde46b81fae06416aa49190b9a1 +size 848342 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index f611155676..94a94b0c6a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5b4a1a0f4f4e0327d46a78086a18e7a53f97542ab0da0c979b6acd3c4d5855c -size 765996 +oid sha256:23d409027fb3134e12d8783d8f55e95134ab28ea851d3680270971cec5f23782 +size 757116 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 02be524ad4..10eeb99859 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:970461770a3237f928ede202e4e8d371d15a5e07f8645e543c4b483ae59524a4 -size 730908 +oid sha256:9087fc3e027dfac0305d0f4b19dc59f4db5c050b7e9ef2878352c74f0a099c01 +size 718376 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8e149ce4fe..c537e7b990 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a09846ee076872c3904c6bbe346642a38349722d0c3d4937e2c1b22e25a1d988 -size 638696 +oid sha256:debd070ee17d1affd02549cd029a035f693af7af43e387cea14f79cf850e2354 +size 626756 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 67628efb5f..c19b83f636 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:16f3fbee451a5000df043a9a5ade0a9047a22b5dbb41b9d7263e3e72c8d55586 -size 704216 +oid sha256:2644cf0d63d845faef91e2a8a8598fb4be7055032e6d4d19ad37a7980ce2f415 +size 690846 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp index d26306008d..1ef20ef30c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:15fd1e9a53d59d4e75c9741533a12b986f57bd55fc6946f6605b63dacafa1b9d -size 612941 +oid sha256:545262ae580bf2b21905b63a54060e3c4947b503a9026dba02f3d399e463bc14 +size 602333 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp index de14bf7588..9c3581e8df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba58ef4963a26baa4e023ef9a7ca9840c46b39ba3f7a6aaf122b8f92720118e3 -size 794054 +oid sha256:b531c9b24f83d4f1089da2954fca7aeb2c3db4f5196819956aa3fdf57c60a14a +size 784236 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp index d541250efc..f630edd0e7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1b1b10790c025efc61efb597b58756319afefbe76fa393d424bbfaa2a6371ddc -size 723944 +oid 
sha256:5cfa02195b6f3137f089c8ed39233abc79e56682c51cddce61337dc2d3e88494 +size 715014 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 35ce620c04..6429ec58ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1895c55279b6f51ed2bbe4f00c5628123b2af98ac8690fbb953b48c66e39bf23 -size 700614 +oid sha256:a8e25be92db00d1f7d48e619df69763f100d5051cc76a30e28c84380b1d28cd5 +size 689416 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index dd0fd66062..411cf10cdc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:cb62406ac11da4d09411f4d3e3a720b3067deb5917d3c6ab1d9e230a649ec0fb -size 610425 +oid sha256:a0c78849677ed83ec4f01e3a36b5e415ae581867dad36df9af36691b9d5aa909 +size 599077 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp index eb04bd78c4..95d0fe594f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a9ce963284bb696e2f9f01e6e424403db1dbc3ad22c93efcd56106f27991d6b3 -size 677130 +oid sha256:8fee1fea2db2d0bb1740b6ceba8341f58204505bf7cb9cf8eb34ea0e74464b5d +size 665092 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp index 0ff732ab66..3c78ef9074 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e49981dcb72a258436517fa9e6941bb525800e2b2e46689d173c63876bf3ac16 -size 587285 +oid sha256:b4c7d1ef994ba90f80259de1b943c0827450ea5fcd951afd2c896a79b31d8a95 +size 577271 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7d7ce69ae2..7f9ed6b1b8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac1e92299cca0c518ce6d7b280b9182ca2105fa4c9fd73d744961a3af5e4e3f3 -size 885662 +oid sha256:e1624ed8aff66ff3f4f6e430c051f63a56a161dc6fce3f76aca2ac9d4bcd5967 +size 874414 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f62f3a800e..f64c395097 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:34636c6b9b2fd3bc767d1d63aa3790fd2dd9132ad11aac3144d152a7b787f05b -size 845256 +oid sha256:775e2097b9379505da96c2d8db212d64fd3cbbcd42d0fabf1bfb7455f7db852b +size 836672 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index bbfceb2f48..479e8eedda 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4fc86cd7997c9ebf0f42f2f6891d73464faa575abdb79faa0826c800621256bc -size 872534 +oid sha256:f3b6702e1758f7d5b7a4834c1350517e24104f1914f1f187949d5871da6654bd +size 859410 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e874e99568..288255083c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d962a330856c9e0a9c4f71c7ca27746cf5036dd09917a63184e9dc6f372889c7 -size 841500 +oid sha256:7189eb4ed8ada80e7059a6ff3e731a30ce077d99235aa3680ffe210d7e4afeb3 +size 831832 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 3111ffdbc3..a95958fe67 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bc0418c01771d7a439d71e774b5412fdbeddfcf75c55d2fbab70cdfc4ab24d76 -size 895706 +oid sha256:c74d47bd07bba406faebbde0aa279ce38609a5dc7a2148762e05a9d103efe87d +size 884902 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 738a26e526..f08f08fe00 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b690ff1fe670f71e1fb01b19be4f80259ec6540193ccec92668274fb6123875a -size 795008 +oid sha256:67ffb0f0e623c0548cefc40c039fe1f526a753080ad6380ce9142fb2f162060a +size 784648 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 395780cded..65620c287b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a1ee4d639f32a6546ae52f6d2bac85121d821a296767b0b2acab2d7e18dc76ab -size 865906 +oid sha256:26f31084761c722b29187e3a2c735e9ff11247157b78b929b884df242ba83b00 +size 857766 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 451e94c2af..480da75485 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e67ccf044c653d04831fe5cec359d642d84d37d7e2cf054815885738d543cf7 -size 768614 +oid sha256:49c449b5952ea7740909ca2d0166fbf961746197a899ac600e2fe7a20a2435a6 +size 760226 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e4b016eb25..265db2ca65 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7ce97e837f57532022d628c9e40716ed35dfa3340999e53b7e35814c1f9b9c99 -size 1033416 +oid 
sha256:a753a720e5c882bb7ba26b0e17c7872de621b99d6a2a9919e148a2900a3dedce +size 1024042 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 46f8690499..a56c50a8a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:11e79914c441652c18968200206d65c1c961aa2b4a1065d06ccb0a1b2bc5f1dd -size 950780 +oid sha256:a736714ad6dbc310963dbb7dc9c38ce7e5665b407c4aa71cb2cf3757097aefdb +size 938496 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 863fcce8ab..e71bf0a40b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:50b9ca1d61bb022bfba10f9d3958f35ecd40edeea3d4453dafe32e58c3072a31 -size 990050 +oid sha256:413308fb4a6e5c50a2d1e57a39da09af56d20eca3ba77b1c3edecc67c30e139e +size 981070 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index ee173641cf..b3c9d4bdb8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:79d403a3e793e1c80fcf7ec0773f2657f46645088602a9f1c3979b1787a7de96 -size 910916 +oid sha256:3dd2e97489e64ace567003e2c0f582f10a1860ff9ed2eef0f2b48f9907576b23 +size 900112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index d9e5a6d923..1db0f1a1c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8e1ba70fe70815411791831a622c2f4950e12edfb4d7b8d3299a5bd210e528a -size 772416 +oid sha256:2bab546626e246b12ac82ff45f273c18661496aa2a431b664f91abe59a069101 +size 760626 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp index 8074a7d414..40bc51119a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e494625b1345a28d7902a37b2cbe4b6f6d7d615abce2df8ca3bdfd3eb35acab -size 742666 +oid sha256:bc8b18a878995bfa91cff83747ce33ae6b783229f750ac1fb77f4f71ab9bd122 +size 732750 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index cdce130cf2..60bcf5feb6 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a6120a187a7843f26b6dd1c8479f6d36152bde7b7149973fd924ccaf4911528 -size 736846 +oid sha256:6a60ee65c25f71e06ee12170a83800e0ebf158393b7ebf48714c0db0237573e5 +size 726090 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp index 8d8dfc32ef..e4f143fbac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ccf20094f73104121345db7110dd56b1b383f9f96d7d4f4d4e6e73ef92a031e0 -size 709710 +oid sha256:3f23ba87b216b458df4ea06c8bac622e137c2f4d979c721e4cad5854285be8b9 +size 700040 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 44a1f25f0c..60aa8cad4c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:add90da835194b8f29eebf24d156e202c2e300e49b7354b0da6a605b7b7a2161 -size 845028 +oid sha256:a6cc0aa1ce454292944384396bc24d830dfc4c4ee51e5059c9a7240f52ef49ca +size 837480 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 847fa402af..e342e1306d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:697e890f7e4a40631bd53b4b453a617c3b35e706f62d34403374d132597f3a7f -size 824596 +oid sha256:8efe9ed3af384cd876ef85586f99714652ae94a82347dce9d282a11edbadcfea +size 817640 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 14e6585792..37a2cfca84 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ed8329cd74d6b6a8b08408419240c990ca73433483a39095e8e47df64fc6f52 -size 871494 +oid sha256:ec3a216bd5bb43335df123341625cb912559e45803dae0f1d734ad044e484e5e +size 862318 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp index b27b3dcde0..e01ce28235 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:bdb23e5efc427e786877014febed01afc49bb4e2c689a3271f650fc8994d7df2 -size 839912 +oid sha256:671c79dde1280d8a709969282c40609fa063d450ecb8920c9cc0abffa981ecad +size 833796 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d533eee712..395c8700ff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8de1373660042c141e68201c3df0a50688851aa5dcb464cfe867f2046d76d945 -size 1016834 +oid sha256:6f6c61c312971a2edbf924e94ce0a7c622b1d43e9f5ccaaa8b2541607c4b17c1 +size 1006424 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index b0718b83ae..f20aff2bc8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2904cffe4b542915d360eb41d79590d0a58325e74bc475b17a960500b29fdbad -size 934198 +oid sha256:76139e5765e0a0eba92df082b8d4d9338e194cf45fb71d1cfd23858726322fbf +size 920878 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4bc3f09174..e18b797d1f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c48d7a8be5c9cfa3e74b6981f743272aec35a3977b7e6e44622393e61d0e6940 -size 980818 +oid sha256:d6ab0bd6cc47ba8fe2a7dab660fe4d5a77b090e2a3d6f9e9307b78db120a6193 +size 967942 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index a3a870a6bd..5d706d7be8 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f3e63474beae37c5eb910c035ddae274b4b11e62a80a8cfae8086c8f1651dd79 -size 902474 +oid sha256:a17c41f69fc640dcdbf7636ecb1fb0c8b78a7fefc43162298c5087c448f43693 +size 887724 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index ff88c66ccc..d177880b4a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:38f51327a6ed1a7e2558cbf6fdd0dcbc05f7753dda8df0d0d752300eb9ac551f -size 754946 +oid sha256:25afe1fddc9a8035e9be256440ca5e34c4c698b74d8eb01d49565483ef2361b4 +size 740394 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp index 6a311d9e91..628ff33726 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f83f86f12aa0fa780f42805d115f877d005279e16833b418010f96330892f489 -size 725986 +oid sha256:cc7a77d48b04027d9dc979f7e7fc8c043838e80fa3f573fa2fe494297b108e6f +size 712518 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index cfc3171ee2..4ebced819b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6d2e03e0d3e577bd23795d5cd3809d4bc7d5b78b977af9778e4f182e2e5fa975 -size 725000 +oid sha256:a65d49ec3deaaf1e402a140c0cd6c45b7cb4ec1719bc0f6769bdec0b2e68de43 +size 711532 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp index fae46696ab..e802a0ad6c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b33c148db7f8d4a72567b249735be37e9e1e6deb21154c259163ffcb1efa4d67 -size 698654 +oid sha256:bd40635338c65ef5deddf71257b804c2af55d86466578fb183c092ba3849bf26 +size 686222 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 9d3dcf8992..a5d77212d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:35047e33b638cb6a5981a535fedb8b195542bf9f37f055420b525f6bbcda0510 -size 1034332 +oid 
sha256:f89b4484fd7ce3c60aa3339c66a1beea78d18f29588f90e533bf9e05178f15fa +size 1019286 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8db27cb76f..0f74aa0325 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cec86323ca27cf5198e3bac6e5ddeb3f1c36c83aa798acf46a3153859c08ebee -size 935510 +oid sha256:ba70cdc42601f2b9c509b34c07933549131c2dd76e639874265f2ce820773e6f +size 927024 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 320ea33224..163762a0bd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a56f45769bef4c8ef1cbf44a5941d7aa52dc3692a106726dff5de82d724825d0 -size 951550 +oid 
sha256:877851c7e5026b64202d555d727972d66d2f8d13f55304f23e74628c8383d44e +size 936848 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 54c5cdb5a9..ce670ed74f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:67641bfc2a236eb967129e8d6793b77a3d49fc6df4b8ac59bb424eb0a15ad74b -size 852678 +oid sha256:ac23c4beb9d5afda813051ce51bbf095ebe4b53be287043de0575f507b8d1f89 +size 841232 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index 11d67e8df0..1928eebd3f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5045d8929a26d4d9116445f0a589702591e108a5889196b08844e2f65582aa8d -size 901388 +oid 
sha256:f156ff828325117958030274a2dbf94c91c2dc2ed2b76248d32e976e2868b823 +size 890288 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 2ea97b8e75..902bd1f024 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e61a7f0768c64af541aae65d4141edae26d80cf032b0a6c362e72d8e347fe5b8 -size 845830 +oid sha256:7d21fdba290c27e7099dc1001c5bb4462c4d4b6071a7500d0776e3e1f778e722 +size 836752 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index c8e7d941b5..784cbfe48a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:450006c41d4422183e670cf05a17e497e0efd601be0f27bd01b1709e33d3e37f -size 808188 
+oid sha256:3058ec56644c48d9d83b4ae512a1a2b093251c790cebf7c092ff10c12dea0fbc +size 800444 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 88085f64a2..2755c2518f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c14c173974ac03f542d84bb2b294f2b1427aa479181c175bdfe32180f2d451f -size 787412 +oid sha256:ac026321c8e24c82a4aa9cd4cbed4f4bbc94d646e26e1230b4bb8fe1d715cd19 +size 778432 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 51e02f2ca0..d075e34d69 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f68587d86a19f3df5893c40991fc65860fc81f7ef2aff95aac85b200a1c07eab -size 988550 +oid 
sha256:3c733251fe4a276152e4a32f7087964fe353b7933df4bdc0760750f4b12ce981 +size 974786 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index dcd4c85f3e..5f6b677a7c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c9e29c26d6f0534d19029af2bd630d032b2209e038bb9431698a3cbfafc9b47 -size 893526 +oid sha256:bec0a37aaba97ef992ec28a70b57b417c5d57679a64b460543e00d585766b028 +size 886422 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index e5f1665faa..29842ff70a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:14aad8e36cc5d52b78ba6309574412790e9af2f34549eea162523448ff2b9eec -size 909368 +oid 
sha256:efb4c09656633aeb0c5b7bfacd15c6c3838b0ee18c152d913dd5a308bce8aaac +size 897478 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 8da2c8c499..79b88b03a4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f50d981ecf66e36b9b21c6756959d2a635c65d98095320aeddd32c6b3c0a478 -size 815972 +oid sha256:f10c6b02e1b59affe0c681736f7287d392a4d25175a2e827bad0fdb89cba355c +size 805562 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index cc1c229119..d618ef4dc5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:7d41f4a773b1669f477c375be7fee019f08c9a005cfa4e84ce6b2f43b88b3db6 -size 765540 +oid sha256:fe3f2ffec2bbb57ad4861f1e8f5f7f5e6165c9fe2776467470006113fa6f1ab9 +size 751776 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index e3cd1f3a5c..211f18bfa3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:252338b6a2b3cecb4730430228b7a201f1d3628dfdac5947ab71ec410e35f012 -size 672982 +oid sha256:dee9602aa339af9afe83eef9f6526baa6886ae8d8970baa58c58ed2f94c4f595 +size 660304 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 9b0cd0664d..c4379eb2d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:670c0a1fa742de67bd245fa72d2e4a5e637335478fcfea724702f5274d1b1fc2 -size 736086 +oid sha256:ca5155ba56d6ce429bcac3c03a8de14cd6da12d03ac3a52339acaaa1811fb60b +size 721532 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp index fc3dfb7b4e..82ac8b25ba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:34b719f2a324841d34f37a071c39f02aec1f3cc1edfca5d9108ea6ca9657da55 -size 644218 +oid sha256:240300a5b40825317ea1e8faa321be26fa93d4fe30e3863048866657a82b1525 +size 632922 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index cdd9b1c63b..5bd22b7fd6 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:43e677e5ac4bb50a724672d9955b4213dcce7fa26cbe1bc7d60c950eb81be568 -size 925634 +oid sha256:d954da71709437aa602da1d71d74e5e07749b44a11ae534fdfb57f0965becddc +size 915520 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 5d06a01cb7..9adebd77af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7721a51c6a4c8e2474610c11f72628829537597255ec5fba32766544adf37a0a -size 878414 +oid sha256:f74ee4fc9078f95ecb43da780ba4e73ec8e88257896f672e9ac26c044add0410 +size 869732 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 29de47ee40..e54002ce8b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b28832b40ce0950a33f3dbdf6e990f3de00118988671fcfa1db6559b736f07a -size 833866 +oid sha256:cb7ea78fb17a2cdf0b12406ea233997e507b1ae0180d7813befdb3ab8cffefac +size 827108 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp index e8dd997908..58cfb678a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:50f5a75a5e5686cb2b054927d7b3e046d5b9195a9d5633d07feabe1f61d537b7 -size 804406 +oid sha256:c0f5ba9ea85d6a6f1ba0304578cc85e80d49a0dacef3a3231305260e48ee5b7a +size 795328 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 2c5220b056..446cfa33b0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51eb2dc668adf798cf72218d0bf08fff4040b03bc65b9ff9ecf22d7bfcb46595 -size 731744 +oid sha256:c0fcee2488a53b5cb86c111e8a6607518dde9d87f1065518d3a8f7f57b109b87 +size 719658 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5ea4e51c13..5711fa4844 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:348db9e6a0b9825beb73e122ddae346103f89d0d923df651d6f6ec348f6a7568 -size 641210 +oid 
sha256:336675ea17da9d5617369c0f9ce0dacd9f4dc121d230063614f6033f61d64101 +size 630504 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 05a2beeba8..6af398bd99 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb563249002cf1bec280d67769c6f742dcd23939ea85b2b8819148cfe220bee4 -size 704708 +oid sha256:3830ec20854bd2e5945ca0381c91328f133e376f39d3019bedc16c2073b4b4d8 +size 693460 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp index d7767a1102..ec04217f3c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:91d785ca42abe72c1bc0ac326c1c66e716a5097b3786b1d05e0114208f70cd8b -size 614271 +oid sha256:2b2f178ece488e799b31429dd1254887ae662ba2a4423eef9c7e1cf682e016e8 +size 605735 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d617ece53a..c94d75f8f6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb3f1bdc7fe0864a9142cfad6fec9892c19ad617351a37f0a5cd874c387a66ee -size 971206 +oid sha256:2f35dd43694322a033270c0bffb4beed463dbde0c0cd319ee8adc1fca4bc6a3a +size 961142 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1ae5896981..e46d5e5ce4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb8f9f5d6fa9a50799b754007cff69fbe03301b75e077570681eb7a20b5bec76 -size 938200 +oid sha256:f805bb555194a2b71d525ecdbe7754a2d6fe7bc072fe3414e3fb2f29fd88934b +size 929122 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 61b6c47c43..4b0a3dcfd4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e42a33ca7975ccef93bcdf2b62c0a2addda815c5ed0bfa27e312e3a47efb6e8d -size 983062 +oid sha256:6189608e02d0996353b6a2c393709cb35edea7a4addfdad899d24c6fede45594 +size 975268 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3d585c250e..fb83b5e56a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fe99d8eaa97f529a72ee9c72d98c8ff3cd72c466dc28ce7c56034a4e3fa49bff -size 954624 +oid sha256:5b80bae6372ebc1a88d18ad26d1f66fafbb7cbc0ec853a229528fa9769b8cabb +size 941698 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 79b5e490f0..8d1e4f6718 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:489bfba2a83adeae35fa06e1fe8fe8e2cba919e4583e0ec09da59d9b55167d3d -size 927292 +oid sha256:5ea147e2f3cdff1a44a0833eeff30609718e466d9192d1f2bb48da5af62e07e2 +size 915352 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 7236279a43..68aeedeae3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a76c69a9bf2f2a9a04cc938339ea993be645129893c54d33d99d56f45ebc2d1b -size 969212 +oid sha256:6f9389acffa5351c661c5555d9bdd5ab6fde60e45cfea23839ef8beb2b6a8ee8 +size 958704 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a80008f99b..cd70c8a145 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:183db142f5949bbe3b40654dbc7912620e1920b179fdc4e28a50d7b3ccaa3787 -size 877988 +oid sha256:ca3c38314285de4ace30b16b5a234f65f7330ade3f4bbd6e7cf1828e49d8b4ef +size 867184 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index 6bbb202f8b..c2fb6f4ef7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3d0788074c4e10e9d846b0d68b71b8597fd179c088c98019408709223e911e1 -size 1142578 +oid sha256:7b1d043d1e012b7afb1b34b20b724c7632ecc5a738fa1e4a07eb577335ce5644 +size 1112386 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 7ca8597d6d..03bc84b80a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:561dcef84bff544f70993f67e0997691a711ab5651b1bdce9bc2a0c5ae47b758 -size 940156 +oid sha256:3ab1bfc58bf56fb1d778686edec497e175ab15e150e240316219aa178e7112d6 +size 934088 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index f2f560b6b3..395d143e4d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1702d0041fea82ae53b37b53dcdea16ca1ccd952d9851413887dfd782efef408 -size 937834 +oid sha256:31420c9a857533283b5c877c4e6dd1319d4ab73ceac09bc257dcea963ef73a6c +size 929744 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9358a98977..5d3a9571d4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:063808dbae4ac1603e7e768a7801e95b2a7cb1f154e9360c16c216d71fbcaabb -size 848780 +oid sha256:e43b0950002463976b95996495efaa4ca75213d9eb36bd92d58dd51a2e397a86 +size 839950 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3f5ab3ab82..15e19d60e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b625b0870858ad7cd0bcc0adf5416ac6d3ac614b02d1c192655562f8e1fe8a3 -size 1169948 +oid sha256:cdd3d5a9c93b647c7872e829f1dcbbb215fcd6636ce5233e2cacd9f5fa86c9f2 +size 1159736 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 0e27934da3..a477dd5614 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7994f979a43cd27213638e6ded0e78c58c2537a54c2c73d17c8f9ff9f20a4ea9 -size 1059636 +oid sha256:3f71b337544ddfb1f1af91771cfdf92057783dd51a3c09d4f34e30d9d3c1587b +size 1047402 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 73fcd3e1b7..c845a312e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb9ee85d50c657a305091636f073591a7f4d16a8b1d1bfd040b8c8ee649a9679 -size 1115384 +oid sha256:fccca3320d4375022bd4e1a05ea9495e237ed53ec8b5eb8603f3ee935e268f78 +size 1104678 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 92c83de612..8c6d4949b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4499832e07702d150a041dc376662eaa16914e5e17de1b18b337bb3b9ad90d5 -size 1008574 +oid sha256:82c2b3c1f77ea4fb44e9ebb05e3314a9f48f64b971e7f44de7b7807a03e48c8a +size 997722 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index acab765524..524573ee3d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f47167977341b3f472cc0cbacb2234eab2f04702dfc9a2f674806e3610d2d517 -size 848418 +oid sha256:e40600667e8e81e2dddd8286366de7067e045f971742f1066be6448a78df701f +size 837022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp index 52ec22c07f..2fe8a82d57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d73fbc1d63cc0c8537e4c64f563fa0c16210bf2c319edc4b74eac2fea23c07e4 -size 807716 +oid 
sha256:b8bae6c1c433cf6e57ddddfe2af0f9b20c15f403f93e03d46cca517993a6d6ef +size 796812 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index b3e029966e..26a0358332 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e20be028bfbefb5685f7de91d6b60e3a284e35f30d4cb4ec66a7bb1b97a04653 -size 804262 +oid sha256:f67e4b27bda183cf24b2b3be368989b178a7b3ededf1fa0fc3957a7c1cea4091 +size 793458 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp index 8398e7e321..5aef7422ca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:44461bbd8179119a915e98e650822658a271808a5d2ea68f11bc4f545cc7983f -size 765386 +oid sha256:7199271701a5472b6886b98e085f1aeefe4bbd2faca20553dc801184e93f1867 +size 755124 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 5a8c150b92..59a0ab7a2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5deae5cf0240242e243d802f93968d2af31d0e6785f686bbea9dcbfd99104d7d -size 960042 +oid sha256:fbfb13140f51cb75888db7d5da678e34e211c2d8f495f1f45ad2a70236080c47 +size 954714 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp index ef2329604b..403262c723 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c8bbefb0523a275413f521d74d0107039b3110a422563a0f3c39178ddf4c39e9 -size 983154 +oid sha256:684da244d62d016f46740a5510f7ce67c752a68ca8b63f3776d0be98e890be1f +size 977926 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index cc1e1afcec..ce66bd3970 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9993d43322399fad8b11b947182b79ad9d3384ee407631e5c60d307091dcbfd5 -size 1145916 +oid sha256:3091fa9333a6101795005c5dd90e24aadfc7a32687e480198220ff5116e083ff +size 1133040 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 6cd8b668a4..e8e73a02f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1f728c8a252603ebc3a3eca34789094ef7d7b337424058f7ed59030e02adc3d5 -size 1035606 +oid sha256:134d09eb2832966a1e707e6dd905f86c69dbce0a70c12521e1aa8c7843523e41 +size 1020706 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3690525537..966939c11a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2bce8ba5088e32612597433f080e155491dbe8e76133f99f72be136ff47f50d0 -size 1101514 +oid sha256:5dc9df36c208f0511dbae4a71c44cd12fd0cf9d867273347991164850366a3b5 +size 1086962 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index ac2083eb82..01aef10bb6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3caea8f5d97074403fcd08f2d7b3dc2b247381d767a9517092defd9dcd5314a -size 994706 +oid sha256:83a5f9e331b458edf215a4d0f9e81929421eea3640dbcbc1dbfcf37d1b1787e6 +size 980744 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index f27adbd093..c56586d3da 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:672f99c6b886b3b23c2989ea45e921d8d43c4410fd4c5dc6aef2b31fbb570725 -size 822758 +oid sha256:4bfdaba351351788c6cfefb09f4cfeb50fd4378f610659b2fdb5f934afa419f0 +size 809388 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp index 14498bf5be..bd811e3cea 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7c0dbd1802089d795235cec572c0f762e576810d2159307a843ac110363959a -size 782056 +oid sha256:4d0d4c730840730f6df407eafe705ec6fd7a4e075fb837c34bce5647f94c645e +size 769230 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9de359c21a..7036dcdfeb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5535261bc7e4297aad170d57d11bf7b4d404d01af8a1b0fb0401d1a7058fd8e2 -size 788026 +oid sha256:7d3e27862c8a9b9240ab6a9f53d89899483a726edfc5c2505d725ee2b3d86abd +size 775100 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp index 685ed1fd2c..c5ae4a0396 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ad00997d34081674001e4a42746e3bb8f0ff376a1f3437bfbd071eaa5170177b -size 749938 +oid sha256:bceeeb711c7e9e7b4b2db4447d7ff9ac8d2fa62915a3570b9f5401ee27db9ad1 +size 736716 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index eef2aa8c4f..2153a12f23 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4989aa413e25e32657ee1e68d258576c250896fc92667d8099126b647a0698bf -size 1162972 +oid sha256:11436756303fce331e96a51ecd6c862ceb62634664005c1f563d66024b09a657 +size 1146938 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8b493b57a7..1ae39ce544 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3a65720a79923a3873a0ad7b9b2417d21bc247a3ab43c4ae460f8f5323c41ddc -size 1064790 +oid sha256:1ab6fd5a769c453fb0a76fcdf5f061bfcaf6bffcb08f5f8eca7b04ce02d157da +size 1054874 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 25e3c61bde..6ed4242ed6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f782d3f120541463a6c3a7babca55f314d18bb2b88c2c01e67c0171f22dba7b6 -size 1054042 +oid sha256:97d5036ccd75331d09bdb1de86f43833415fb1519137c3a79fc32eb991f90a95 +size 1039784 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 363d339aff..d21c07984c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f88a4e1821aad704f5bac8f01748f404546dc79d4cd4ed44e5e1a33a7b54ff42 -size 954332 +oid sha256:df9ba8899998871387d05b3934e7ba0ee7793f90b0fda86253cb9e2f10482cac +size 943084 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 0fe7a61b49..d49c184ff1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d2fb994f0cf6f62bafc62008f45c8407c262899861fa17411b07c23554845755 -size 950838 +oid sha256:224d8fda11d31ea1e0c618d4e4fefe965850ac7ad6ed9f89e4ee4d581872d2ee +size 940084 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 5ab988fcbc..b725c066a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4001be6db981dd2df5e2f83e0b88b29684388dc27dac98c57a491211d138f455 -size 925276 +oid sha256:5970409a696d0c41bdaa2df831aa83fdce8afa73bde4c41bc0486d04c1d4d10d +size 916198 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 79e854d3c0..21d50a0478 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:412034acf41793b8cea233726776526ea5f5799a86fd7e6f0e8d4677c0cc7fc2 -size 1106088 +oid sha256:75b9bc733a5499172308733781177ad337231e8c8cf6f709d5c24b0d7f569eb7 +size 1093360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6f2a7f55c4..c7f912ea3c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a6534a905edb53ad65a3ffcdcb8daaed6203daf656b6445e52d99fc1969c97bd -size 1015652 +oid sha256:e9574000f8ca156c211e3b518f8256951057728d9de7323217d273e14b2cd4d7 +size 1007660 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 8825f20519..e98895b888 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d20708098892872b975272d4eb89d965ff975dd7c1d0a84eb77785b167b6fe01 -size 1001846 +oid sha256:6e6af6b6a3546342d0cfaac51a728e96169f923ee26a529877f48b7a5969286c +size 990006 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 0542ee833f..2ddb7d9755 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf179e5f426d0682eac159e9689473bddd41f2afa0f0ef3173606475c16aa8d8 -size 909634 +oid sha256:6fc4290dae91c5246864997fea5aefa1c5a2ac203152953b0a1b3ad8f9e0e438 +size 900014 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f4ed50fb03..67a5dda2f4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b95df5bfd19f69ab52f9454ccb63563362f017262784abb19a48533013db1f6 -size 835768 +oid sha256:a964d0ccdb6dbee450f9824886ec1a81be4167cad576a91a2221f78164a94810 +size 817366 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index d0b45c282a..345937daae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c32aca986e2d450ed1fb69ab52b5983c01417519bc5aaf974a94bff1a0a99ece -size 741286 +oid sha256:0f193d56c60f65ece83f4a2202e6703fcef147c4e6fc7464441b9db60a1d5a8e +size 729694 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 75ef73a3c8..08ef7c0576 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5d4bf31ed74c5fe556618bb5c25ba9cbd4d909a54484dfe0cf8353449ad894d0 -size 797236 +oid sha256:739efda24ca0476c31f1197eb7470f1b402d0ff847d2486e9fd9ca75573d2a0f +size 779132 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp index 682b9f2672..a7bd62a6a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bfc674b17a0bc51b258829030ee0ee6bf974e672e2cb481d1cebfb10feb15b11 -size 701028 +oid sha256:360d48e42db4fcaad03cd9afdb84d5bb9a4d4e0e14846c3543702026be8ca38e +size 690422 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 34dfabb0f2..a94e5b5e07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36e877cb748c0a6c42ee25786fa634abe296c5765b764639ec24ce42e6c81eb3 -size 982386 +oid sha256:3db20d2707d8666eeb1506f092cf54704fb3b4506b25bd07c71a4a6e206c0973 +size 971830 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp index 12671efc45..afda5c0f7e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0fdc51ed734f3c0fb4b443f73610ebdf639dc2bc390bf370a4828ad91770a16 -size 947598 +oid sha256:6a4a15b9641b384cd4a0277f07b59553b5b011399ed1f31a6a294e249c301a54 +size 936894 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index c7cbf4c2b9..cca3f9cc6f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ebc501cc21f9839866b4c0451aabd20b81e7e9e5fd773c49e53d71d94aa16c82 -size 790576 +oid sha256:c9417150229e73e9c044c08e77b2db50709dcae0fe7644124a85af5469c2ee6f +size 775432 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index f618208754..be6afda179 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:625ab87dd7b943e688829cf263e78ad3bc4e49b4ecb443228e1e2bf76d9e39aa -size 705074 +oid sha256:fbfd1a27e1912875b7243ed86e7bebb2157ee13d516fd1fa69e16e03bba7bc21 +size 694862 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 0ffbdee7b3..722398a9c2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f3b73f7d17cfcfb5e51dad4cfb5c8f76cd520c2ce2181e2119166cf89b0079b -size 756338 +oid sha256:ae1c5d505bbc9eaa1711297bf62e1df05f09506265ee6966d569af13a5706520 +size 741094 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp index 1cd8843097..131695affb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:30cdeb48a0630f8b283bd5fe46c17bff17943b86d44d777205775b412f23750c -size 667480 +oid sha256:764517529e4cd68cef984c55e1c02b6220dabd1057202312ad86f43ea905a94b +size 658254 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5683c8eb51..554ef5dbd8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4bb5f9db28aa0e11fcfc71396b8bde7168439340543f50ab5796152fb267e0c8 -size 1103694 +oid sha256:406d3d98d00c50f58f0aabbd0c5de7b40499880b394b0bb3d95f2ff86ec89781 +size 1093482 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9ae2b6240b..86f7e1d274 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70b36e276b4084a258dc237aec30fab3328a7cb85cef59a91e87b2bc50a1e8f9 -size 1061314 +oid sha256:90e3de413c8c95c514618d080960b80c5ec3fefb17c83213668de87b6036e013 +size 1052484 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 59183be5f9..7d74cc9c44 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d9fadac21aed072aeac1121a6c6f79276b37addc8efde246f4265ae40a9de079 -size 1078972 +oid sha256:9b5ded8012b944e4c2492f1af15cc1b5c926e75c39b8fbea9f053a83c9d6183e +size 1066638 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5a757b7fa1..bec43e694a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3081dcb35030573cb09322114dd3721c1d398d0a2eca19731a2eedfa69c0f496 -size 1046606 +oid sha256:6f78021bf34f3930dd3823f2c9f4337a9f4ce60df3803e09544f35d836512f1d +size 1034914 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 69fdf41bae..d1f6250361 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9ddfecb344d43a2ba51260d2323b3ec403b65a913507fab9a59f32ee5900c40 -size 1102588 +oid sha256:e6260a8c40092801fb8d81bdd0a4b8bd782337a357508e7a2211e8b83e5ed40c +size 1090946 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7c0344abc4..74669ecbae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de5bba1a86377f76a39700ed5816ea1d547f54e6a1284650c90cf0cb5552cace -size 1003124 +oid sha256:699a5001bab24632f23e05920122b95d9e9878a5a835df08da79d1b8e3410c69 +size 993060 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a98aed2315..e036624a28 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f9a5f85cca38f700fda64ac3f06315169f3db9f9c2a1764f90d46d3610faf7d -size 1065488 +oid sha256:425a4da6fc0e81deb6a0846de0ae81fabf7a1112f8385074768e73e8e0807e39 +size 1057496 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 78b595c0d1..6243389a8c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9f6ecea31030debb6f97270b04eba7dbc1a2a6d1f8e62861ec895163cd93af6 -size 968836 +oid sha256:56ea1fb7e225fc795b7e4b0385e871a29fa637cdf235bb0e09105de56863355a +size 960054 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 079df4c92b..b28563e991 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:667fc2343ecf7817e700da66324f72cc4960e38af4539d3eb4667bb2bf4526e6 -size 691906 +oid sha256:10a64d00b779bb202eee47fd75481c37575cc4d49a3bfa502288128ac61a2718 +size 666302 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 06acf495d2..8f313cb918 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3897608d31bd8e25ee1981cd3940aba454afc3d6f64f68d856558517ced928d0 -size 608177 +oid sha256:0cc6e98dbcd7de26641ff9ac838fb859dc1735e09004d467d2e15a9b8e8b50d5 +size 585435 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 029867100c..9186a54a97 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f240a1a83d5a4c87ea766c57fff6e77ce895cd09cc595d27240378e4a9c21268 -size 716842 +oid sha256:34b19463c71c6742c9f841418cba8c5fec5ae34af9e3f791861793461d3ed1ec +size 688970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 2fd434e2bf..cd90a55511 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a70209cb83078a17ffdcfab6b2c53d4ae2a3ea06d862a5c0f1b189b6b2ab18e0 -size 631636 +oid sha256:a4ac2a68faa5dab7c5ec615edbdc2d0f8215f84cb286726a3150d0e50b59e1b3 +size 605785 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index c0347acf52..d72661b218 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5cf81f5a4e205538a6284468c1f75df75d7293f96790ded992e8786a1776a102 -size 697626 +oid sha256:ae4484a99da2843596ac89f44a0a358d02f39b29dda7e01c993a30c7f4664d76 +size 668124 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index cb8ada7f4c..ae9b6517a3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70104cf1c08ca53673c436a09cd7f4c2319d1346dd2236a17a6fcc733905eed7 -size 609655 +oid sha256:515af47f6d1d1dc8aefd306efa63826755686289f53c4d76758a1346e567719e +size 585335 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 2400c511bb..2047e0ee99 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6caa2464e55e06f1c8bc1805dc4de2a55f1dc24015b31d507c721a5db02bd919 -size 715508 +oid sha256:e6479ac52c43488a0c568a2ae8e4e95ed36f6c2d02e8075a749a35a06e7f39c0 +size 688228 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 0749638f58..c734e325a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e04f532f5274b471abf11c6a33032ea0a5813b62bf3180f260d707e8961edc61 -size 631880 +oid sha256:69fe2d1d142025d47183560732351587fc3cccc11f8ad785d2ab1b3fef64cc4a +size 607065 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 49c76f6611..433c82c312 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:528d65d16daaa662e91c942331def9a452e0ad57ac2391400e0c9f1da2fddea2 -size 761054 +oid sha256:0437d35956793a46b358cce6d2dd428bfca91fea23ade0e128ed71710f28ea81 +size 734906 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 6a57088861..54d6e6ac1a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d0699643431902f059eb161c5a2f0a543e00db7ffd1d3a8261cff79acb501d2 -size 678954 +oid sha256:708ea0b752e9355b008b27b7e85979be6ef85ad8011500f248363e6f8253ba07 +size 654436 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 733ccfa6b6..788d84782c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00dc120e7c632b5a24d006038f937f3dc796347ec406fdb332599677514d3b3f -size 785004 +oid sha256:6691c16f20fcfbcdb35cef4590ff693be38eb116778b156f030efa3093759f09 +size 757920 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 1d6be46628..350291c928 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ba138f02b1749849c1d3afe376d664fda71d5d9769f39bd0379b873c9c3cddd -size 703102 +oid sha256:0747e2e273798840db5cbebce14701137a1550364ed6598924c8f3e6ef84b77d +size 678190 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index dbab95f221..0de71c529d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d9468c4b5068dc56d7703830ad48bdb05672f00cc23e630ca64df064c8bfb04 -size 831800 +oid sha256:28774d463e3aacd548603797b7c40061cd913578419d8159cf959d6f014f6a91 +size 821982 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ce6c159840..6128a0f790 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a1b57980e2e8dde4c67244951d8bbe46d79df94a49ce7d8e7b46f296eee58647 -size 668306 +oid sha256:9f5639db5703298fe19e94bc247f2b101e763851e320c2edcf50b45eebadca79 +size 657750 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 1cba64dc96..51f3245dfd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd84f11506facc2668ae03e7ba12bf9b9c4f868ac1adf771054acc22bff1b1cb -size 737274 +oid sha256:40d50ca5b57cfb33a42131e97b8e94cbec600249a9a5e3bb73eac98add75fa00 +size 726718 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 6d721c9674..0f446890f1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb79dc70eb96c35c97a0456429b636a14ad816a5a83b953838651df3e8a7f7de -size 737768 +oid sha256:a8fa48ea49c7c8689185a4e81bd77b60be0936f4e3d90b73acad88ce6aa21206 +size 729628 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3e554f240b..856900ce3f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b02861b3c16aefefee1d8ca4f76285029d8a3d18e35520ac6fb56b572a98e75b -size 623362 +oid sha256:d68e6908cede583914b017d8e3c113fa4a842ca2161d732da3d03b9d2118fcfb +size 613297 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..769f6d48d9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:a504fa4d546fdabf65a7a3c85961e61f45f287cea8485a664f77c913c816e74a +size 846328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7d7ee4bee4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0b0b9e251de3120ff96b45e312308fb0906bc2665cfe8693f5cb0a83f71c70f +size 583279 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..13ce020eb6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26a983b6429f12027bded43a4ea27ee9d1d1e258c996ebc3c011ff80cdc06cbc +size 642776 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0df4c16186 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:800d96bf158b4ca6be336fd8b1c2b9eedf09854c9f31c12dda938b0da48e840b +size 752986 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9b23edc207 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:984ce0b38745b5e1cbba59777601bf76afe438aec09fb48b75460fbbb4de1dbf +size 543811 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index d583b35465..a5a16bf6ff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6b4a8ee030ef9230748c1f002fc984884176df88b97491cdaf5a142bc6444d6 -size 793856 +oid sha256:a7db51910171ac21446635db84eb446e76124156f70b2b3a5d3ee9ab8d79b9f3 +size 766230 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6f48bfe098..97042cec9c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:088dc744f8bdd495f811e74f6c67846031ab7f68f30f83fbacb83b174d9d7951 -size 657200 +oid sha256:89ea9fe68c4a6f67a78cee9f361e5161e1c8a791d7f3a6b9be592ae1f5b55b13 +size 643930 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index d39c4d3413..be7bdf2e10 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eeda59e755de5ea701f10d4dc7e9c8702c9a64056aa652368e5cce78f6948b34 -size 715512 +oid sha256:ee68b4a833a0fbf4d4ce9c35a6b0af959cc7b68988fc0815c5fa62d38288cfa9 +size 701502 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 118202f5d3..849b3cda72 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f563560d94957a95eca6fd6827c631c4803892ec66ffbc794c493cdf22edb6c -size 680584 +oid sha256:74d0aa0e58cca6eb6e5e2830efb52524b17972ee03634581ef39d701995fd11d +size 668844 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 57f6f26717..d50adcd28d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e80f8c32a7ddfb2fa4d6832428104425cf3d73c9a472873d9d6db7bc5a87b76b -size 616645 +oid sha256:c82efdaf5f2065c882697455a97bd63f8184afc1c91327e29d583dea8fcf5631 +size 605003 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1f5da0843c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e2367805b3d5cc00d4ec0349914c2b3ee45146df9dee716e6eadb08315baa22 +size 789736 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2b54ec2335 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2418cb214736e024f6a07e95b8b38f31cb3dc0a224d0c167b2c660a1ee9948c +size 566401 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ba2a7b895d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2181d94f533f0d0f770ba3241f8633b185c36c6ee957f1d133a1747428b2a1 +size 618252 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5442a889bb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fde42ebf196cefc1b65e96741213d141f63fadd0c41f45870290f40f152d361c +size 691216 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..78c97ecff5 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a164a08e3d3eca117697cfc271f917838a16f22b4fb2d5bc836d77e45202ee21 +size 535121 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index a6c9395c23..7c16a1960d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2fa09c6697365022036f139b89a80412f7dafe88e7c9cf20850a938daaa49f8 -size 776366 +oid sha256:f4cedb10461fb885f8eb43ac577c2405d09777c42b062858d91deeabb2e5fbfb +size 747456 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index e9a07964f8..50e1f5ffbc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:1fcda4f1da8f4d405e3c3ff83edb30c42ba1f9a761b67aded8c0bb91a8e4fec8 -size 782248 +oid sha256:b8cb1ede7163e406b63bda9d5d2143415395fc7face6fbcb3136af837784e672 +size 768632 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index c2086cc6d0..809475f9ce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:20c77d16786fc5c24badfb8ff628fb2f060e16f7672b0ca1ed11d89fd6ba8a0b -size 688644 +oid sha256:1a8f8354755bb4f9d2f4b55b303fcc042472598a2fa9a6035f921b52f67425b6 +size 663434 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index aebf681b38..776a151bee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:efbb0a2aabe3597727a42483060f8295ca259b6678c9affb81339e5648d042f6 -size 695660 +oid sha256:0ecacba7f913cc72f0e578d807bfb2ac7fa35fc8940afff2dfa8874808f9864e +size 679084 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index b0fad51c30..76162db878 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d19c84a7b052c6efa4342903410733ff3651d8a28b365e7c727a8dba888f2ad9 -size 681310 +oid sha256:7b6cedb41a433520fa9f136ffe276e9d78a860b4680cddc2dad7d43429d9ef7f +size 667892 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index fcc0233eed..b5b591c8c5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e9d00e2d85bada1e976ed7b04f5fd0d411d415b01141392abba44e9009ec0787 -size 578985 +oid sha256:b6fe71097d914f38e08cb5a1f8d7f413df3739dcf35a8151a179e88a02e0d0f9 +size 571781 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 1d06463659..4a92e97c62 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d188b926cb91c390cd3bd0138b2e2e25df9a246a2e0f8636dc7f832ae52d0ef2 -size 762810 +oid sha256:635909247b3c3fae68170ee6cde597e3acdcd7838ef49bf3d2d25e4f4730a72c +size 748058 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 51d183011e..314faf3bf7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b18f29aa8d6f4b7cd915ddc5f1d87f4f168f569116387f626cbf4abc7eb52d36 -size 659942 +oid sha256:ea7b2bbdf36d7aa475212f04e24cb9198713ca75c1c1f9edd179ae7bd05e0187 +size 649878 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 
9d83bdcb1e..645af93607 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc12300e3d8ca0bf9159d46550237fa4fe23a28657b3a596afa4c31ab5848025 -size 679930 +oid sha256:39ec0aab98b2ad663a96c36b11030a13a36f023c3ca0cd69ef652a4f7fbd79b2 +size 667990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 70d6c54081..08a7aa7141 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:85cb69394686a9d0b446cc675f789aaba1e12d2f2168c45701f4ca4be57de5db -size 591269 +oid sha256:2692e9dfdacd73838b6c69a999430b06b0573e688815bf7fb30bc4bee5e86976 +size 583721 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index d2f92b0146..f9e533d3bd 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e4006b7340f12b7a586d629e9f47468cd72bf5f2bd5846e678de2a633c1ad9d -size 621122 +oid sha256:44786ef4104d1d9b9176bda1151ab197e19058c5d4c55b75820fee3e2d386391 +size 621024 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ae7730f88d..d090bf819b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:505ff51dc803e964681fca8f8c0215af401eb70f5d57be9a28a9c571fa2d4279 -size 536507 +oid sha256:df5dc6a560916baf71686eb003d414a7f0c7f9ea50da9b04c4b5109eabcfa20c +size 530389 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 732531bd86..d27397c998 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12db16768f65c4abb1fb152b4f8df3adb2f541f9ed02c6b1e663c07c5ecd6edc -size 801156 +oid sha256:65c3d2b4aba88cb1be0b99ce71d62b1b1aaacbf08b3af3b88fcf079058223bb7 +size 770766 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7978488b24 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fdddb81f2481d5c7b58ae8c9cdb7e5ed1c81bbddcd5722dfd5acf41ed37992e +size 794112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index aa2a0ba984..f2b576c064 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d5a15a0083151003bc2e7f942b9e85edb92b711a9892f4889d9531a2573db84 -size 710522 +oid 
sha256:8cd98997e3db81e6b88bf910f1119b6f148fe57369ec877469b95760ff2f3754 +size 685362 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..023e1cb1c5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a2cc39bc9eee7b0fec2a7895fd56aec5e2f252f6b2749e68870590e34123578 +size 704318 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..dd955e7ed0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e74972853a00080b4fc64477803de0684475ce6e84891e367cb3709439c96575 +size 605409 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..aa77f22a15 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4552eff98b4904367ca7ba94ba51585087e28bde5923b67e56db1706a79358bd +size 500913 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ee14c73e0d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e102232b3aee8abc18d7c69e7c764951c53f49df1681dcb74b1293e5962ef65a +size 679558 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4d7a74051d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f3e0f3e219d72a149033c4bcacb721f70e4c8cdab94ee5d204511bbf4b7bf3b +size 567761 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..50979fa0f2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad436c4faae664f567554b623c9ebd76192b1e419d09816e4b1b0b6b166cdf59 +size 692138 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3eb4b13918 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77c3a2ae6c1271ee60cb860400b30fff813c81e9aaf3df8ed2737f374f5a5f7e +size 607079 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..988f49e4e4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f44843d6ba808c967dcde20464bef95e0c67d6e94f9140dd58eb026c4d76cc8 +size 
558935 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4fea4ee556 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73e5a15caead821ada19180a60cb5ffe483e687623c7dab9b1c945d158360a36 +size 464997 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index ce794ccb05..80cca7da7d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef1a0cabfa409c89611eeebbb9c84cbf77ba79ac6896c3efb8d668687b22114c -size 826472 +oid sha256:e628cf775c25b77fb231f5d5c309f9277189d7b4b65c10480402f1f504aaa852 +size 819022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 881243e198..698ead6bda 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:16b7db8b616cd759ad1b57f75364354d2a5543ad6845f19df87c628f78eea600 -size 852790 +oid sha256:a2ef0fe405702dbb879c43f59a1240299dfd42f4ec249e8bb50c347139ee7f8e +size 843466 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 675d8954f5..ba25193aad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:468a829d17d8f191e5b40dcfb520515e8456b7812faf3e2a5da87bd81c302b4d -size 791192 +oid sha256:602c9b86f5e5accf6c3e61648a1b8e02deba501ef0b97f7bcf06cade0f1a12bb +size 767610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 1662117b58..6c0a24a4c1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6ccabd41e1172fb9c7402f5ef3c8895df6459ee8c1d63bdb99b5c9ba7b11b8f0 -size 816672 +oid sha256:dc871160e3bec05782bf75035309da5e672ae305b92ee2ca7cf78961bc2336e2 +size 792054 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 9a15faf25c..96ae20c8b9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b02dfb0ad923411c00f14877b371e2d265925732454bc1ed5f3d4e80360b1725 -size 763404 +oid sha256:afd43a572f752dc5aac49c04f7545cc9891fd23f912e83ca020cb080b9f80963 +size 747320 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 782c6e3f39..a05487b43e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e2e1bc4944ef20be283a64de954692c1a164d18d6b4141e6b947758c55b25910 -size 679628 +oid sha256:3b0fd8a8024177b08694b5522a32d4bde64a1a1043a46f3ab39045809ee3e5a5 +size 663742 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 23a77a6bc6..b2db135a28 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef55377a367f2b1774f58327ee8305dca73d79002346079985f0540e8d5b141a -size 792288 +oid sha256:8594beffe7f30e899a689f6e017f080070ad2b5e52d7380f8c47bc5b8b9792e5 +size 773738 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index bf8ea3fb43..5b8c9d88c0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c1b4b1b9996acecfeca85e14963e293ad755521c0f6453e62763db40d9675e29 -size 705996 +oid sha256:6cd21681525d356b23cf7c3ab22d03390625de80ba1441bad986f65073c04ae4 +size 689074 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 0fe4daecec..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1ca45215b78a7a4c0abe33a97cfeedddfcf6fb9601c20cd3146244deb1524fae -size 581995 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index ad476384ce..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:53c7361963f7d82b4b12f53acc276cac9954af6512201a516329d3be47e7943c -size 543117 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 09ffa6576b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:98ef90e6acf9882dd504d4964f9eb5f93894e12438e95fc45e42b66f9d1ef18f -size 567929 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 1e88e21d9a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0b588a00c5cba994307fc23c96f51d7de0ce9ac0b307e208186a92d5fd82d807 -size 535119 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 2d8d3579d5..607a5ee84a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2923d2f9a87e48990c7c806429bedcfb4d1d201b07834378f982424e12855bf1 -size 776364 +oid sha256:1e54f7ee50ac3468917d8878c3b523e46f0df62f92145016ab83e46edd765fed +size 746764 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index fb8e55a40f..461bb992fd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05ee9077797758517f772f72c56154bfc295661be5874cee205a6272f7d1d935 -size 689874 +oid sha256:59ba230bbd07746c5c6960307e9203079e132ba81a52b6ccc621819c23c305ad +size 664962 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 2fd04272c1..3f98ffaf35 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f689f8febede036701fc4771c290dfe6c290a3ee8873efaa5e930b73f86247db -size 797650 +oid sha256:e14438a0b2889f49119fa5c193f69f405858799bac5ca24d266d6eedceca9c16 +size 769234 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 3d76c078af..3ee5ab5176 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f19f61b6921850b91340fd907d91b9c2a851e839943ed3b965966d9cdd17e15 -size 712888 +oid sha256:f080e3fe79b3d17c4503d49542f2b559abd589868fdbfd96964b581b36c2e9bb +size 685902 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index 162a0b44f1..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cf65dd48162134934aaa25a29006e8667f46c26359ca3545bf29c59c8a75a7a9 -size 589769 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 8d06968ed8..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0486d986844b2daa658590766c4bb79993939726ba1a30b326b334387a330a9f -size 486161 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index 5d216b305a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2d5e68590f2e57072e8e97bcaa7de5a8ae313311c9b150e5ed3c321a7bad1765 
-size 559673 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 2b43a60709..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3c81c5382e00752062feabfc074330aa5ac2e8c365d00fc2c0fcfb789884e716 -size 459125 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 354d7b124f..68b9eb4a74 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dfd2f6e5af3e97951aed7e3dd5c6fd99a765576d82c5828e9ab40e595b7c6d5a -size 882004 +oid sha256:18104cc006e0243ea97d00fd181e16059f4ec2acbb7e34db5cb103afe21aeb70 +size 873024 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4cb5152603..9576b5d5fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:324220a303e94760cf9a08ddba9579588c51108179969916c11b5f955442b26d -size 697938 +oid sha256:0af3125f37b7984fbb5c14cdd454a5fd77c6341a2aadc27a030acf9e9b74ad6d +size 687924 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 8d082e2d0d..2f94e23d86 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f92b595640d9c2b7023dec8db00e0f460310e23b165ee8297ad99d709eddade -size 772678 +oid sha256:fc13be7b11d4a05d6a570b33e5c85af6f5ecf4bcd3322e16aeabb4a1aeddf131 +size 761924 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index c2f5b15e4d..aeb72c0ea6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:46b6e1453d1443962916580f95fb194aad182d31a82755f28a7547ff8a53d67d -size 778154 +oid sha256:7f9163d904626f896c699fc57d703c04e6d10296fb9d9515326b5a079316841d +size 769916 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 24b3f5e644..798e75dd4c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:722693db2da062f05e7db98dc4d3a839c255ec1b200becc39725bf8583578d97 -size 650034 +oid 
sha256:fbbd26b2d0395d1eb117e99d6165f0a1648bc2420175ad697f0949256132b9fd +size 641500 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..898faa381b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:964766f98e64332dfb58cdc18a5c5699f274340b245cbf79b47ffb44409adb1d +size 897814 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 00cb5c27af..dc590ebd1c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:679ae4e69fd21ec683a306dc38acdf13409ab8d2d3bbcc47a5d912a7c64ba5e4 -size 607435 +oid sha256:14d4f44e5e30b8cfa04db29b4d9c751f9e2b28f6684a45d83cc75176f2602d6d +size 611135 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..41c9cafad0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:805588bb4044bda31e61ac18a60df97e9f9f436961a5d7b6e240d03401e66324 +size 675418 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0f63fa3cb6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc1552891656ccbdebeae8869e88238d249c39bd2bbf213104a73178f381595e +size 792436 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index d9883f247c..a2da5ac924 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:25220c4eb9ecb892b7b63fe4d68c194b713ccad1c664ef0d60050b4a19fa4811 -size 568559 +oid sha256:abbca75c129ddbcc470c4f0519442f3488cd7c2330cbc531256d045a7bce42de +size 570383 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 2698be53ed..e85ed2ce49 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17b093053f63dcc94afe9a1394877b557b3915a85cab3b5e7be00ffc0cef80bc -size 846774 +oid sha256:dd9bee9f206414a111f4162e2d04511b4218683f0c3c684a9f7e4b8d066f9169 +size 820330 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d5cf9cb8e6..2f716a8e24 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:16f13a50b9eaff1b2c9f54aa276ed2514505fcdb344bb85dbb2943a3af48369c -size 686092 +oid sha256:4110924e0ff7b2035b8d0eab10c635bf5b84eeee7e9cb401a865000348c41b5d +size 674154 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index f95173e0ae..e95d988839 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c88d8050884a666f4574d4ac8dc43fef98af626ac51a10d460b2aa691aaec61a -size 750128 +oid sha256:2144be7dd4239f670c36cde0fe5241c8a54bec2c0cbe0677ab7952062145ea10 
+size 737448 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index b9ce4d1876..ba5d5936d2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3564879c0247b5a162303f4460359cb335926eb36a48d1dbd77b5534e0733125 -size 719392 +oid sha256:11d6fa0ce48436532eb2f09b20f6531a882f2a82be5a3ece220ef7623c89b709 +size 709032 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 131ca02fe7..0082bc4732 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17d1a8ce3aeecd941ac45a9459fbf26e40659b277f4236db48d663caaa30cc8a -size 644108 +oid 
sha256:fd475efd9ba1eeb80656489515fcfb7f0b1a69b85142e8c826b50785f7a1d1cc +size 634044 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d585a052b5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fff7e89edd67bd371ae6861173176daafcb39616411181812960007c559e3ee +size 845120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 99a6d25229..e36d1c36aa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5d8a69f650caeb4c6a36c1f45d684f41d3a638dd56a75a97cace4b76c87741cc -size 593369 +oid sha256:cb604e5d515a332d0abbd1314b3109bcf932492eae9dbce5873b3bb663e2c320 +size 595047 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..98b954a708 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71c101a9fc7cce30bf9a7c16e14761193e30984b55c708eef28ca2291d610ceb +size 650892 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1e48a00f0f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37369af2ffb5526aac4c500e32f2a0b6feb24417108f67fb8f02be799d5b638c +size 732194 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 
f934dbc9c8..fb74ab2b6b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:425acc7097ac31f8763c27ac48fe9cb0720c0b3ed7d1b2d45907b133dcaa8288 -size 559771 +oid sha256:dbb3c95399e46cdd664ab678046d9c0581080b12cb38164d4c3cb94b877fd23b +size 561695 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 36b78ab85d..43c3af32d2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f4563bea81b2d332b580debde6bac0d0932fb43b48ec6a6506fe573ff11939e -size 847536 +oid sha256:c96fd32e19ca385989857e4492ca04d91a6bff8d96c1ce5a88c7dbaf8f4a6363 +size 818332 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 
490687a75f..022e10e4fb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:798cd835dfc82e252a815a57a940ce9cadbe1141bf090e783ad7d6621f572363 -size 831368 +oid sha256:ad699f44e9ba17bd126a8655acb32eac23f1d314d969f102136fc7a9cf4bd5c1 +size 812768 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 7b186e594c..dd586e06d4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7a4482c9f02df303653d7b65a7c613df612dce736f41a3dd03393827dea55f9 -size 757940 +oid sha256:e90fdb1c8ca4401ddabd3b818fa16dc019d9d0f8df022ea30b91b0eaa0735ef7 +size 736972 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 970df7ccb1..58698b6f94 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4de7ac156122f5c7351082e66549f1c2971405aae9a9bd46c86ca570b9aa92b6 -size 742312 +oid sha256:30ba14d110fa2d27f772a99709b4e5d567e5eab339ed98fe3853848bacb1d880 +size 724552 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index e399ed943d..e7cf9e4985 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:86726021a351b6ea4281ad65853a8e3144ac24c4e1b9d635c5c2ca4bb02ecddc -size 709018 +oid sha256:4835bebe8275decf8fd259163d70819ce87ae9f699e901062b6df9c3da64de24 +size 695402 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 82bf1626b3..f4b2bf4eda 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a8b17e8fc55fcecf328b171cb1ba89aa2d753be6886712f5497607dc2df20d1 -size 606593 +oid sha256:817189560863b18f071bbe5e6c60f4b765d5f2c14231644c57bff6319fabcc1c +size 598601 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index cf2e969972..024d76f435 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b81c167582f9ad065478cfbc4798cad5defca059f53196badcb4243465c90c7 -size 796190 +oid sha256:d790eacc4d819aef99ced7315d7a23ab999c31ef514210b8b344654fe040429e +size 781194 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index efe1e64037..473d02b7f1 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80b5a0d19c28adfad974db1ca6edb8fca45d18be010b5a8d80f71957fef32db6 -size 693322 +oid sha256:30809ba769c1d1f89da46d30e7a7d5739e92f9a269531a6899a371a2d06025f0 +size 682962 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index ff81585b71..e70db8eb32 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26f79c519d526f0cd580bec6bc82ca55b1597d199db1eaec5e782b2543855fb6 -size 716714 +oid sha256:c7f33c1e23bed299bb2dd671804f35fc1d98e77023c416ac28408eedd6e2c583 +size 704332 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 88e8b73223..804eba43ff 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc6c1e9ec0a761cd0a6e5c2c913c21e003d1d1dc6cf04d434ea75c0437fed71a -size 629880 +oid sha256:682d8d9f1b28844e171e6406535bd000245fbdae86c10d3c4d956e1ec07b5349 +size 621296 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index f6865351d6..59e4dbbfff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3374ac6cf21bae095663958c7e1f9899c8435cfe5a67807e6b3e81b3a5e562ae -size 657020 +oid sha256:64a2f209d6e4ac5514efc924fb37f56d446e820cb09c13da1d517c8d2d954889 +size 647054 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e926bd0ede..5a4b94d784 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a805fcf64a8365a457a251e176eb0b5ef837ef1f423bc006a429732260cd3f69 -size 561057 +oid sha256:55f789e58d8d09b471cc5dae4769670d19e16e88fbccf4956e4f85108119ba9e +size 554939 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 9bcb95d84d..02105c9363 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb3128f78561c952f2b9a4f06a407520af9bf98e2d9067014cd704d8b64ff94c -size 871832 +oid sha256:d95d3d2924d907e8f655477905cd270d0026bb263aa086235ff0c0bfb7ef7fca +size 840012 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 
0000000000..61711264fc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33270b2eddfd5373fbd2306bd19c044f4185e46e3c535f325c4fbe77cf107ba2 +size 838248 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index c185871fce..28aefb0a9b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:42c9fb57925e7126065f22116b712fbbd4b5aa15a47a89b878ef2f6e127fa0f0 -size 781544 +oid sha256:f4f57891eac0e977fd298edbb22bb4859e3d515b50f4955c76ca31bdd5839ad4 +size 758308 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d264efc7c9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd5819d670ec18a4212e0418555b59134bc9b5fe97fba647c7c2636c4ee81a11 +size 749588 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 79a6610a85..b847695c24 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec08791257d8e49dbd2242fcea5ec7bd75c40a0946d23c1f9d886946caf057c6 -size 614421 +oid sha256:000885daf7aba5eb8c5051c32cdb1adb7acac17332ccd1667387eda980cba3b2 +size 634894 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 64edd8bd9d..4733a45a73 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:57ce4d700d638fbaed5ca844b5219c291f4cf5647ba5af66cb18d47c23eb3ce5 -size 509875 +oid sha256:bc6ff89ec9e807c9347bee8e582f74817a0b359998c119201c7e45052f8aaead +size 528621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9f67f6ad66 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be1fbef43a4aef18b3eccf22194a8c0c727101b00f8a7cd9764b5c08c581c199 +size 714568 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e40c5d37aa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0905d6bf1dc9abf6443daebe3adbefdb582521d113c8bc3d1b4c7f27c14d7ed +size 601733 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..619f0d58c0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13ba63067646e79c949a152aa8506252b3c7fb5fc4c188c7e71ebe0c102cc4e6 +size 727690 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9538496b07 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78ed4cff46fe4e91bdfa55507a0c558237252e7ec1eb284368af2bdefde506d9 +size 644654 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index d8f4424fcb..9333d1a646 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d6f35633d5c2619d45818637333c97d2b844b841bf566d24aa2bd412c316e31f -size 582351 +oid sha256:2f582d28888efd60ffe0dd58f7003a82c76c55341ec26ae2ac4148eeebdb812a +size 592465 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 08a0d5bb15..8aa53db720 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:506d1894447ec85ad223ccba1365a70ab2d565286886c34f4dc95b48e098ea19 -size 481259 +oid sha256:8b3bc6df3c2a3966ca3ab872c3514ef018ab6c2ab95d17821670014ae3c2480e +size 491423 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index d93a20dd26..23f5131478 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd09e38cba8c24fb1241bf0e2beb8cd24835cf8f470f4057e2fe502b50f5eaab -size 666696 +oid sha256:ede66c2406e460cf21d0c4a6666a9a9f24eb2f18c93bed347a255292c8104114 +size 653524 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 5f0529fb19..1a5023bacc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:19f834611977f06972162749f4c64f9da373d0fa12a1bb012c0efd3d576171a8 -size 574237 +oid sha256:f7284c4b54a8b014c5e8620db31f67cf9ae73a4faa920208e21157eb26012833 +size 568809 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 23d68e4826..f63e52d592 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:eeb297c0dcb28362294f63b2ba5af45e3289a8bef5a2baeeb90023a707ee5ed7 -size 682112 +oid sha256:b9e3e81d6089b8107d5b67cc32d7e86ab2b058a102cfd0cfa773929e5ee049b2 +size 672344 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index a6d54a3529..ba2061a89b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23494b5ec76ccfce4edb9320e5824d36f31a5f0cf3762162df2b85f9f41709f7 -size 592217 +oid sha256:dc0c81510b25ac7c4875552332024bdfd8b1c171cc522e0b42d547d825c38fbc +size 585163 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index b05893d24b..6279e29fb4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:20daa33e521b4f2e09e2dd34e28fb7199747664314e574541fdd82d9307d2258 -size 663142 +oid 
sha256:d6fc90f5e3c74beecd96094b228c9076746d579630a5ebaec586f7687d92ad9b +size 654262 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 908c259ed5..6a986a0349 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f28be471909a1a8ac24b3d0361c1c3c613322fd14a078cf1adcdace3fb8c843 -size 577687 +oid sha256:b1500f2a555621f86439f4889b090e562f6cd3e8f2d74aa59944be8f82cd0a60 +size 569251 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 91bd5cdc3a..a5e714347d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:18763120631f02986b3e2ef1590abe8e89ed612dc3d23cbc3859c473db99743c -size 680482 +oid sha256:562c644fb1dae9e3b14a01bc4838c3f8166c1eaca48217b40b3d758c9226838c +size 671602 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index cb76d6197b..590406d170 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c07c5b35379391e2d40bfd5a9f90b7750866f7357c6c6cd8550e0ea2bbc9bd9 -size 594831 +oid sha256:d74cf9d6870c6b34cf7bb41cdc4ccb0225bb2db2aebeb228413ed1c9b5e51bf8 +size 586395 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index c09a02c748..409cfe7906 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f1dd50568269d24c00132ded9d21d0420b6d7785059ab916a8917f0f86e5a00e -size 731454 +oid sha256:140c577cf08868c4d4b727eea3c7c0390ad77d3f595f7e9d82659e3b0c94eab3 +size 723314 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 55ca57f0dd..a9c5e2cf02 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:381d7d24e6ad47c1249255516abf56fb210a1b827aec7c35ac012d78aef46bc7 -size 644026 +oid sha256:1188abf3b2a0186301d57a2d8237ffc8a29d896adfc1df37e9b25e5166cb0c4f +size 637910 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 01107cf44c..ec758c68c7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b3b966e52a1412e4067599d352ee95d1e105ce78a2b48b5771a2f1a1b7d314f -size 749582 +oid sha256:1196b75f4b9dd225c8221a7a50c9555a0308e65ce3b541bb0f11cf73638f8f38 +size 740654 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 4e1c802e19..14af2e856f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e15cd3fc83a52e82c54a713730d0133dd2005d304f3e2688947fb2eafbc232e -size 662206 +oid sha256:378c670d316012e0f67fdce17b2e53715034435b4d94fbaaee24bcf8ad643e5c +size 655298 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index f9c1105e9c..98ad901e91 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b6d48bc1a4e8073b92c83a29968d9b060fd60f9ec99b381ddc327d8d584fb10 -size 800128 +oid sha256:4e1a91218d58984e8d63a4c27780f321f17780fd4c8bf35ce64e22d756bf5a5c +size 791198 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c8ce6931b1..b8df1e62db 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d83abbe12184558ee387bb0bce832d406e722f3c5c415d678a1f8375fcaa178 -size 769588 +oid sha256:9ef66bfd50d878be8cfffca2f9c72a4b8a30c8ef54e30abc0b78c8af4879c941 +size 758290 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index a0b78db7e3..37dd0c5f7c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22204cc2f829aa97a0cdf74fd3a331b03d84e5546eeee6a4c61b018d4a137f5c -size 853800 +oid sha256:a9f874c2c9be8e2de0af12a9791982df3b15a353af82524140b3ffdf45ab68fb +size 840924 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index bf1a69e0a4..1cc72e171e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:433f4b4b0edc8e864c5ccb2ecf25c7821f35309c76bcfabcf95749448c912b38 -size 728790 +oid sha256:92fc5e553569e402e6a199a42e40c73fe4eb71d57657aa55a96ce5d423dba784 +size 719662 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2930c66445..fd2f2fd4dd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7ad0b6ac904e08cd9595eb16770fe1282663a4252e8cbbec4b20af660053b543 -size 727604 +oid sha256:871eef1f02bfd5e007230b0c2ec8745739106546e6b0a33b589608217d80963b +size 716800 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..fcecdb6cd5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c29914185720b4181d5f2dbb647efb618cf3f6fdb24918946825527c55767be4 +size 815642 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9fc9e2cb35 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a1e8f19c1c1fd049ec7827864440cca2908a5df4294ab59e2dc9bb34a82c4ca +size 640260 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..555b02000c --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92eb3cfae2bb3fce5f429d8c0e166744ce0aec8a4253f7dc8b4250a08691895a +size 705676 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d876de2cad --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93b9d2ebca71f09908421782b65a1952114e0c2f353e52163e146d69a43f7f3a +size 741146 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b0fa87709c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14bb411bb2de731c5f855a16599935fa174c139a8e7e37ec833e9b25676d6182 +size 604343 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index d5c48b29a4..79895fc61b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c60fb6e94c5a6c98286ae8e05175c07bb5bed9b5d136d3318bf50b14ab4fc2a -size 718968 +oid sha256:bcf0f3296f84b78039f484b54d06ba7dba1441131ee8aa4cc57727f27b337026 +size 703330 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0e5312de3a..d5f50c3b4f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e4c8bac67b3ed63b5099e3ff3930a52c45100707276305527c7e9bd322686487 -size 752808 +oid sha256:01564d9ed5dc44aa7d986ef2162783a6b977f1e3bcafa2ec3355d474233d83b2 +size 738848 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index a0f33be7c7..dbd87c6b13 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61af73283c1848cf28c716327df582923c1f8c4b36d36c1553563e171ddacf90 -size 823848 +oid sha256:6edf1a73a143bd443dde31893dd9ef63072ad6d3ba7576e58c17deba38e6679c +size 808308 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 8934529fe8..5414f7790d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98618cf96c8beb005b1ab818673ebc34c9e03e0a5a8992239f26b03a45159d92 -size 661592 +oid sha256:ae96490c1250063d4a7ff9db5d76aa8ef7c2fe190f3fb64a386970460e877023 +size 646594 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index fcc02344ca..26c6361502 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aefe1aeb4774e826ebc0135ce890bb84ab9284ef6c7d30b918bae5cf806a8045 -size 717286 +oid sha256:10897c3438f5e910358bb65630ab6d7327d58f2f95c13d242419e90c8a06520b +size 702832 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3442db593b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07beeedc60a734bc07cd49b226f3349a6889c32f812b170b70f2741a82fc0c5a +size 729006 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d748f4ec0c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a68ca216871af25945886a84f7d8208d384aa2c242bc6a3976d2f9ddfa196474 +size 619386 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..60359d2391 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60fa24dba3357e5149ca211ca5ca364a86388809ca06ee4134095797a4b13d54 +size 675478 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..abe86a6003 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7dbbd0c7190d8040f5c0197fd94cea781b9f59a8756769245bee40aa857cea1d +size 668868 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1643788c87 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b10972adfd2a6be69ad36914eed398d1dfc398b0c0607c0016bd47826225aa9 +size 589733 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 801a7358fe..4a7e834347 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:54462dcb64a93bf6176c9290379b1ba80bc040e897f8ad591b8b3704da38a026 -size 754216 +oid sha256:087aad3f01b5be2c472a2ae17e9bb9af93298434e9f1126a5fefbadf51f4d607 +size 742030 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 74bdbc571b..734397ba68 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be1219ebbbf1af07622543c2f042b1ab5391444376196578caa589a46618f772 -size 719496 +oid sha256:9cb63a523802daba7e0302830e97019a6bbb9c5e6e1d2ff2087a0649fdf587d6 +size 709136 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 45a41869d5..04151b3500 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ec1d9336be3e9c257c9d66f402436a70e6cb8ba20d334ccc729c1fa3a6e4f5a -size 663680 +oid sha256:d3d5a310bbd50a963c3c12eb76ebf4ac26dc00320d465c03e6a8f0311b0fbae6 +size 656872 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 76636ce029..451f4f01cc 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aabb3e2de5fdc44c59da3d3873650a007cb95579a869fca4ab4d51e7512b0dca -size 630640 +oid sha256:a5b74ee980835be4de9ccaeeac841a0b09cd7277ea246e1a50c3b9876cb918ed +size 623832 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index eeb54ef656..2e7752890c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4477d058de9aeafd50879e24f2eb341fc40bd176876cb7cb98f8daeb969b0840 -size 778202 +oid sha256:87268002103dffe11149e6c81483f2010d87834f93294fcbfd60bc18e3c92549 +size 764980 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 49e0f5d019..019b5012aa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e8856da891ffc2bbc08585eeaebc9ebf8e38fc93655b30fc49cc1d0566dfca23 -size 673508 +oid sha256:6e151d32d80c952fb2feab3eac0baac9274fce4f6d95666c6fe630352eb33c77 +size 665614 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 0026176827..0656295159 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:751bf495045033547d58e78e5f559e008236580c600d9815ed6de4260560151b -size 868284 +oid sha256:966697385f26107e528355215196d2de42a09dde14337015285b73ecee4f48d9 +size 854570 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 128f9d740b..ec2d2337f7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:fde4792f7f3e698ca68152872512eb071492a3b75fd501cbc55384c9ccd9c4a1 -size 765268 +oid sha256:976698ad570f024fd487c383b1c0d42833acd45b0ecebce2d37732ce776ebd2c +size 755006 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 590093bdc2..60de58318c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d045b4fe1a11cb138934f5b27912c00ee67d4b01911eac156ea466448f6c545 -size 665622 +oid sha256:659c9eaca1ed6ff507bbe66acdc07f903aff0c722603c78f32ad603f381964ac +size 654622 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index dad0875746..b2398ad913 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e2021f10d79b75632ab5b92126c4ea4a90d9c59f6932865a68663c4d843806d2 -size 574051 +oid 
sha256:0e24588912ea811d1a742a419d658ba296b9e80319c66afe9708343034dae118 +size 566651 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index fc8443743d..03aa071ab7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:869575de197002d6df290b19fae76bd7eba6f1e43c0e33ce660591ba2c72d406 -size 715792 +oid sha256:d5feaf72327eff0d45676511c17f027f91417f373a418f5fbf581140b4101a99 +size 714708 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4abf71f410..69d26b277f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fe45117cc2e4259ac984ce32c5c6d1ae0835dfcf486bac793c8b39d91e9d470f -size 626688 +oid sha256:d0355d81bacc34dec41d76cf606fb2ba25714108e9c5102e0e018ce96b5bb3a0 +size 620572 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 4303a5c816..939f535183 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7b9bfd248f739dcde36f683ce29592a080586ccfb3008329901d83da55af09d -size 770766 +oid sha256:4cea4ad2f6ec80d416c23e00d1c2fa29e4fe097cc86b6dd364f0df6761a8876e +size 758284 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7870146e8a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6bbc28c5f6460b3a8afcc5cd77fd185df670cc76453d2b7e62adaca2c574484 +size 732940 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp 
index 5801ca8b9a..a676eb03da 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ba879ffaacf04fd439295af3bdeb38b1385bc078a377bcfc719be6f5dce636a -size 681564 +oid sha256:2e375e362c73d41eb9b5044a26d9a710a74d968d885cb470f16907cc5ad24ca0 +size 673176 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2a665b448c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47ff806548aa5b7b82b69fb56c88db80408961f5a805d97220264ec35f8c4c58 +size 647732 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6ce9460352 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d77e18f3033046e77d72d3d85d4999a090fcbdcc7cb6c4453a7adfd2c497e95d +size 658346 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..fea4baefee --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a28d9612d3cfb33c3c11e542f35d2726f4727ad0142060a593a6491690717dc9 +size 552713 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5b408806fb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:521491891df8e657d017e0e4b95ba25c0c286379b294374f4f864d1018b18150 +size 733974 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7cb363d5d1 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6a142c67aa5b1c9caa11bfcb650e2e9e38e449ec5bc4fa7f015f6b0fa1c2631 +size 624988 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ed4fa06aee --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cb74159cdbf1f56deaa0b826cf7046866ecaed22f1fc3433cee389ddc251076 +size 679214 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a2959345f2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17d00ef9addeca5b0591f1dab34baceb515d7a9fedd935574ae44d11ec6bdd9c +size 588085 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2968d40dea --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:114f3e809c744fe63af6c0a6ebdaa8552e2a56432e053c8a9765f72427ee4290 +size 607923 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..11ef09d822 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f119c8392267b29d50801b87a42097d0e82948a8f43e9b33caaa316a0961eda +size 513985 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 1f1769ed98..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:408125096f655e20cd296f4ec8780c3a32ac9e36973761ec22a4ddf45b6fc45d -size 639420 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 3f837604cf..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b4140bf3191a2f60d888a6fc026f181984596fe76ddf2603d5d9cd7cc6720577 -size 605475 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index fb6ef4eedf..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:77fa54325600b2d6021b129b084c4df731ff9cd1d7f936ae9d8171126c854fd5 -size 621900 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 
e04c8fbfab..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d208f4efcd39b327d3d0c964e580ec7ddb2dfa203037540b4afdd72b57ee4aca -size 593579 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 19b864d35d..fea25672d6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:87efee2ec409647e45186314c8f0cfec1163a397f660a17b86af2ca6ae1aad1e -size 753522 +oid sha256:b6a13b0344b96e21619e706a2b7afbc25482921e90857b1584e9f99233bc3da9 +size 741190 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index b6ad674dbb..baeb900b69 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:57e0fffe3a24693f7434168aa1ec628b8f31151867527b8858ca56103f527f46 -size 667872 +oid sha256:3d952f4bd8648ffea43e3ae9bc70421fbbe74104e4465b2f957735fb0dcfae2c +size 657118 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index fdc9a19665..dbd316f4de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:96ff3a35a9f6825ec9f4a165a868046e8b26b93aba52b6baeda35653516e97ff -size 774760 +oid sha256:5c4466bdcdccdd8490cd851eca6bc4aedc0dbac806f1c7c876b94e6101c49244 +size 759220 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index c261651dbb..addf22ed33 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e70ba77bc9bae3c29bf99e8e1c6137d5d06484f0ece21dd02a1be3920519f65 -size 685804 +oid 
sha256:dfd50d4ef0badedf8978bba34689ea74e2714d4b02f8bea0d8aa79c2d90c2c9a +size 674358 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index bf495ad335..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d82f841cb84a698675ea405df238c958cced8a81e2b30dc2a5e7e7b89207681c -size 644382 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index ec2cee71fb..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9cb70747f75c7784a4c475fcd89fa7f35ed105588f8a4e2e9a930420ceb14403 -size 540131 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index 4ecf6358b3..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e47333cd4492a6c3f70d207772b7cd4d98fe9129c4443fc19a74233545306a9a -size 610241 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 09df958255..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dcdac8d1618914e78affd44d25af8f11cd42a64946fdbe41681f7493155534e8 -size 509593 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 52ffd65829..f3db03a7a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5dcdfe946b0a29c460452e1dcee94ac3ea5f8eb1b231655372380e8fcd4bd801 -size 852700 +oid 
sha256:a433a67c6f3b3dcaa6abe39b298c59c555355222516de0fac24474e687b101a1 +size 840860 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 233bd3b995..cbf4a8e470 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9cec9a7cdead29aa66e57e0214f13296300d206ca465f503d53050877e94b96d -size 798974 +oid sha256:22d9b23e288c5385e7d46567bf0b6cbd1a94ae068744b3e7d972a26d91c78e75 +size 787232 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 89752028ba..7c345777e7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:88ee80edd4cc3f70adff1f244049eed63fbc9fc27771d9ed9a61b621bba2760b -size 886540 +oid sha256:1072df12d4a5c877309ad35b3843409a9d542375f3a75d0b0b41b3928fe9afee +size 874552 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 52703423e0..8fc4abb460 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:791a4a76c6e4c5acf3bd16c0f1f1f24a30e911cb694a51ef7e009c19309564dc -size 767646 +oid sha256:d501f8dcfb307d243e7725f824bd13fff2c6837867b5f527fedd521d3126c430 +size 754672 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0ac54ba61d..45a43f8db5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:fbab42ac7e8047082d2c2f0c08cf83e9ed1d9f75c9acc249b903965df1a57d48 -size 753782 +oid sha256:bc97dd2acf31936e606805603cae5a4730e3289ecc2d99088e54baf9f014f0eb +size 742780 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..34ba245475 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e3558a42d3652fd938bb8686c7902c8fde917214c7a22eb580e520772621037 +size 864712 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 20af45ca5e..e93d108f18 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:78d0ada682f8ea70e3d6e16031551839ffb8c962cbbbf93e0ee6534a0bf169e6 
-size 665698 +oid sha256:1b41ebd544d0faf1803b8b1467fd6eff2aa26cba056135c271cbb57b9fcee18f +size 668066 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..71bac3ae8c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7a3f6f394941db5dc2218e9c02012935c3c0b552bb1320a23838fb19ae4fc29 +size 739106 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6b6a745b82 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1e457a5e5f937fe88967c9278f6d44ab148e6e5a6ee620c0b00e2726977cdac +size 776946 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 590b1f1d20..2f3929a6e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:446942541b9df58c5f247a122be7accc8bf9840ca403e9fde600734080fdf2c6 -size 630176 +oid sha256:42e0653214a4fd559321fd36ed7d37568a3f83376261e312d0c18389e2d97484 +size 630868 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 2ddb34393b..7b7a3ba8c5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a4165c7fda01e91dd788300671622dd7859d88c926987ab1b3801ae1a986a26 -size 769468 +oid sha256:46112639ff0d3cde3df54fd511785a2eb6a31b850266c728db01320424a4eff5 +size 753780 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e8912ce85d..0e7cea9506 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89e5bcd04c80f0c6d85daccc06018f7f059fa124dfeaffd33110e07c7462f977 -size 781404 +oid sha256:bde16e91121b25447806afd37de3a390bf6d0e57804b37397ee4c3efe511c7b1 +size 767788 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 779eff4504..049e7d3fc2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef0d20678c25e6bf4a6c5b1d7ba04f4f82af83b42c3c0a13013ecc3741fe2580 -size 855602 +oid sha256:3bcc21e5c6c69f8225dae3d02ee65a75fcc052c36260034f8ec08eafacc17264 
+size 840950 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 95e2e9da6f..33efa64f9f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:03a4a9354d81fa8e5afdbe6c29f07ac2906ef1ae5fefd2ac04c9a27c4ec5e0a6 -size 698278 +oid sha256:cbd47ee42d7e5929531011692724d4d5b3c87589d633b029ed5ee5492980a1b8 +size 682392 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3a61c907c4..371d653bc5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e21ff06d005cf612d8617742e68ad351c951c8f95e54cf19319c1be5fd8157fa -size 743514 +oid 
sha256:9abf287eca042571b110fc3ea1017eb5aff580c9ac879e722d9c2987b072d11e +size 729602 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9c263b4944 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36861bedb782cb91862ff3d70e86e23c404fff164c665cafd29480de9d4be7e5 +size 779654 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index d8df7051f4..7b73f2d609 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dfd0d45b3509f503ba1fd66f14b02708fa039b1fcf152c5bd5d833bc0ab7ea04 -size 647390 +oid sha256:3a0a72d36d57c0465eb1fd748a118e3654f8d9a8d50c5e65260a028ef8cb7ca2 +size 648820 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..701adca069 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b731aa4fd4d6b7b6fdc79a13af7c3dca3e2dfb2a54e946cc706e7a27888666e2 +size 708120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0fd89249f0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d619bb8530010caeb9ba494bf7899cd961a8f17b65948f338f0cde17b86d3bec +size 704616 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 
81301eed60..8a5c24dfb5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f1456211b3dc9d103e77fd3986a7857d4ec4ae7940e2655fbd3991b54ef11a5 -size 618478 +oid sha256:9f2eff02806fb9785a7a00b6be32a76d13415a72914899faddba370e102849ff +size 616307 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 341e88ca9c..e9b89409f7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:787ae64540e234dd8387ab2798b347d74928e0be4544fd9f6a226a969fb93b23 -size 821488 +oid sha256:9b154a349acc222a34f9e506c04f379827e3697b21b88e9af83bded93fef483c +size 808958 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 
ae8abd8fdc..07193be3b4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f107c49168d30297cbfba4d3515707dce9f54cccdaf74f25165338f86e0213ea -size 766790 +oid sha256:5e18f453a2cb8e697e111156493f386b7308c0480538beb3bef6dd077e5fdec2 +size 757022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 3c2b55e8b3..59246ff8d6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4b58898b781badacfc592ab7fab8ab8672f73c25dde187fe675c24da436a95be -size 734456 +oid sha256:3c16eafcd17b830a7099ce4dd3c9a440ec17e692638d40aa703122f789098e4d +size 728092 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index f7d1b0f303..5b1d82a619 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aaa68952849f771bbd2e6293c825eda1b417faa241a79ad90dbdd613d89acd6f -size 680942 +oid sha256:9436c75e5ae8c01ba4f2e87198e400796c03caeaf10f1c52afe0bb7dc8861b7b +size 673344 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 0c32ed06ab..85e780e6b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:214ffe8bc3d1c61ae85e7f562b1bcf77142cf63f024f69729e8bf768fc3d2d80 -size 806748 +oid sha256:dd694c118f6603cacff8db359ff274ae5f6651d2d5bf32cdbac0881aaa4c17b8 +size 792490 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7215f7be72..10d39fb370 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf90e4272ef080fb68938f29ad422c52f84a59184df385129e9e8584fea87dab -size 700822 +oid sha256:c8f1c38cefa4265d2668eea27668bf4efdba44adfd88bea219fe76dceb707ecb +size 691202 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 4ff37272b4..f72e6318e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f4c2c807d9845339f80128d2be790ab1c76d2591d1e01a30f8a0c8cab5ddb054 -size 904280 +oid sha256:93503e8f4dc9e36ac36edf96d63f6a4e6d60094bf7fbdcbefddd69132d6bb3a4 +size 884498 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 2b0a4be072..782a464d7f 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:faef5c7c6a6a306c1bee515079146da5ad466a53180e3c550527dc77fc5d8009 -size 798846 +oid sha256:b4510a562378a250a16a3bade61a52ee11bb95ac25e51a9afc8f1aaf16e7909e +size 787994 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 5f3ba634d6..4bd0f062f7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a804cd605741d5947d2328fe0db084c1cc18c2af8067dfbeac953cce9ebcc9e9 -size 699744 +oid sha256:8d93fe3d18892ffb375fdec537dc63161f8c90cea50bb386d1c6be649b6bbe7c +size 687410 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index ef3c9ffb5c..1a5a461a76 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:43fbca14dbf095bd6f29678be6252d6fccc6d19bff5f7be9c53f564e4189590b -size 610639 +oid sha256:926612df7a5d2b941ec70a09acec1b5a5148daa22d5d1650380a58df0db442a8 +size 602795 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index bfb6d975df..3284cc701d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c65f8eaf81437985d17dcf24effc3db61e7e05835467563d5ec00ec5007fc9f8 -size 752824 +oid sha256:53bb262206352e528942f0bc0a0011b55370202c307b0b84aeffa39169f9eff1 +size 740738 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 849f170ce2..99ca9f78f8 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:95ad70188e48a2ba92d5e980756c37170964dee9b5f222b604a0f8fdcb563c58 -size 651584 +oid sha256:01f6d3baf88cfa7356af2915fa1c269ae7cb7eb734faa5f7b1f771dd4ed9971f +size 644826 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 99040edbd1..1101cfba2f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d35bdb6f7f0ecb28d8fb79990b75668ad6973db9b41d70fbd4d3940d151b7f23 -size 841640 +oid sha256:450a7fb122dda1eaf8d432ff90c54f0147f619dbffcf9df33fcb093ca5c45e48 +size 826200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 
0000000000..c91e514630 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d90d3601bdf934f5365cb011a96871eb5fca0616e23b72ae9942791e68e2308f +size 780232 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 6b18982657..745a25a0a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:25951f23d58c5a8e4aa7fbf4d101ba59324d146752694363e625b6fa9808a0ab -size 752784 +oid sha256:14866d2ea16ff2b82d2099278a2e90984be78b2b303ccf46213482ae43c2408f +size 744544 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c8db8cb8ff --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76a84a0a00b9122ebd45c616b7e2992366f6876a8793ccc8b65fa8c8c89bd250 +size 697148 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 6f97bad20f..035e4c8e69 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e2a278306acf675fb7cabb2af595b408fb02876956a0589545a944037a7c6ac -size 668638 +oid sha256:5e79f50ed8f9ba45eb5e547b67f65a1843fc414f8707d6b25b0ad96c159a880c +size 687830 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5ebd3b757d..a6c7921d65 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:515ffd5b3d34c5b6d3526725e82441868e67669449fab313c1674b18693891a2 -size 561675 +oid sha256:0ebfa43c92eeb7e67e703e717649319c7f89138e2f2a46be66b13a1953304b6d +size 581309 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c2fd5c6fc9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d37f59e7ca7aed13e96d48ff026c18c90921ae4143553b11becf9ad1a67bae4 +size 768046 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..28dad32979 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b92cbd7dc42891aced80278ffa3993df50f15e96aa39c1c12e677b4619eefc5 +size 659948 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9820c18e1b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85c71b97790cbefb18bb0a44e21c3d78cb0b455533f33f768035202e71fe8858 +size 711016 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..23c90f7225 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:166e44667cb13862344e9d27c698ca42721614cb55f4b265d6b78e5dd30622ed +size 624230 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index ef56d9aa8b..28993b0b4a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a558447663fce97d29ae4e205b69e51fe194385cf8a07879b590d4bf41fa06b2 -size 632524 +oid sha256:e95a9c66fa50eb0a8378bf2582a6674c67610def634bbddbfdf94882b3e5e0d8 +size 643576 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index aec4225e92..d2b0603dd3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f993295d002074be6bd798ea5c9ce9ad6e0208ade085071b8c4b7ffede013a03 -size 529557 +oid sha256:50dfe2e56fb7a63901b33cfa7ff609b169bcd6dd2df47f3e5c154be986971fa9 +size 541101 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 59488b0c90..bff6e02963 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f09122a80a3da1a7d5922c062509786ac20f185b2109d1619b8053d7630fd0b -size 652190 +oid sha256:fdeaea34ae0692f6185bf016f0cb2babb4fd533fd1531fa51b120aa051d40265 +size 643754 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 6460f9638a..e90ac4282a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e02f10a3c6ab3c033f11a414126fc378b975f03791b71c6880346cfc46bebf52 -size 569351 +oid sha256:4eb3650f6e89d83a67655ff72f73bdc33325a086346d2037fee04ac6da653f0b +size 562691 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 88800e286c..ccab1dc600 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:cc6325e59eff3b27ef0f626958bf311939940cdbd8337bd60ca3e5e4677abad6 -size 678804 +oid sha256:d0a379720b34ec9cbf9a4b06c91fcc0df344623ecd353387759ddbcbb0bcaf16 +size 668050 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 3f03805283..bdcbe43d1f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:079e2fb8715b572115fc54e761dc4eb428d737dd7508f7f7fb51a1ea4faa3b03 -size 593399 +oid sha256:2c4b763a7146d271c19f41440893e6e5e7f5e9e61d52995f468315a446de96af +size 586049 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index b1cd0edc67..5121f522a2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b4bc124505c7f81f747d18204f3ee808ef9699fcf88b5953a7ceb3453440c885 -size 655394 +oid 
sha256:d46ea49eecd879d0c2b5443ff9a50b80f4869c519981bd3b2ea62e418affde41 +size 641878 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 51c0c73cfe..e23fa2d888 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c36b15cdfef1b482340b845adeffbb109efbffd1fc3b337eabcb19fd3256f646 -size 569941 +oid sha256:e20adcab8959f0783dfba270fa205a70333e7d9407a2bbc18af4c99e8358b731 +size 564069 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 08dee17763..3b27f2f849 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5bccde9b75ae69537490d6e9661ba2bf99fdfb6a42664b68f3632b6b3300974 -size 676582 +oid sha256:2d905c966e137e8c776477b1d93ac270b6c3bfabb946e4aca1f4f9c1c2373e62 +size 666470 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index eb0d3d4650..7ea0bdb20b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef5b58a5477713546e5f4fee52ef68db6cb5df0b3fdf68633355f5467651a1b1 -size 594533 +oid sha256:cff47c9c10d39382743f894c4a6520375a9fe2ef055ac108e4aceb77a37c4c0c +size 586639 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index d2daf8511c..1a9e33bfb7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3849eba650fc9cd98265ad85a54eef05c6df33991bb49caf3690140216c95988 -size 721338 +oid sha256:0b9087def028a1837773f2162bea9e345f9d91fa11acffb257ef4d48c149973a +size 712310 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index a96f44a2ec..4c19986dd9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:468a8bbceb14315630dee4a939c926ed348ffd56bc50d54e75179cd0c1bbc5e3 -size 638450 +oid sha256:a99870c91e1e8e6da4c91385e0dc1efc5c2c83c6d20183eca0e60ec6db96eaea +size 631592 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 8048432dc9..f68bce771c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb8c88771ccd7eaaea38618d936aafb93145eb2078c65e3c6d614d7646a76333 -size 747804 +oid sha256:245434a44902f253c31582475bc1af960276d9aa5de4dfab7c77f3835e6937ef +size 737000 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 3c69c51480..b0ace50d8d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:91c818dcc24ffc133780ff4bab2a2bcaa667c292d581bddbf9c6335a0d48e71e -size 665460 +oid sha256:5e5829cff5ef255c156ea53a490c9d0d4fbe745ba736f21aa9e960dde4a24af3 +size 657468 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 21fd4de756..2e4925aa1e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d4f5c6efd25db4f09f2cea4e449a32258e822a2bff00be8ba13833d25b3d948b -size 776938 +oid sha256:69b3082eeea8f1d406933a81fe401d2be465701b539d215ad60760dc08dd913e +size 763914 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5732c6bdc8..6c0707436a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb6628680b487ee3ec2b8afe9051474bfd1419a6b4e8303fed2684ab163b0fac -size 647880 +oid sha256:bd36de0dee3023889c4f07b98159d520eca2b0bf9960124433ab273e4b2966d9 +size 638162 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 290209e4ce..de9a1f383b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c6e5dd8e338d3bb1cf27f4ee88d4e95b96a862699833ed324e4c51f073f24d79 -size 712656 +oid sha256:0dab42367d90cc861049ff81d3dc5a1c3940062c8711490ecd7c1a24fd9340c0 +size 699532 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 7a8c6696fd..3cb8a719a3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:49f5678898ab8c99fe1e0dcc1bfd237c373844805cf54dd8c2b0183b9ac6237b +oid sha256:e6116b080e8fd02b3ff306fc3001c8f9d8deaa04e4672772dae6588be5542c35 size 704072 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 91a5e585e4..b296152c94 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c7989ffeedba9afc544a7c4572a284cbd693e5a63a5139b27a08a85b757a41a -size 623162 +oid sha256:dd6c833ea01174223ff7f56f352da1b9b1249f8cc2419526d5518747c48c09ae +size 612309 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..74b718b35b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e0a3c490da469d0a7b92b8d3ede2691026b0d092aab19d3d0940b53a542e099 +size 789148 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..95d4a7d9bb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a106d23cdcde58b4eb5552d965bf5539c107fc95b55f048d02476ee728b8f03 +size 570599 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..740db71bed --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7873fc80b4900a27419530b657cbdd4d4bbe53adc50d3b3292c0295cfdd8dad +size 627974 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..228a562cff --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2c1ae82c778a18b88b6c6eee8469ccb72a6f77bfe6357df361dd229fd96406b +size 727380 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4e08448b7d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:360309b27586b126633b7a46d4918c03f191afc52a314e18205ac84d058f6dba +size 547755 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 54928f88b6..2a53f89cda 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b19bfd3b4b569c168b48d4d53d560d2337224404235afa02a6e689366306c72e -size 714772 +oid sha256:21159389dd99cc3a833235f7d291d9b7cc1489461530c2938ae13e1689c609e8 +size 703870 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 86ccf7d59e..8ac3caa449 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:78693af90e4047a22286cf8ffd06f42929149247ea7502a3ebb19f0e2a822f06 -size 639686 +oid sha256:7f6452415a210b70929cc02dbe77db1dbaed8591c7631f30501cdcba29cd4489 +size 629918 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 66d0ef86f8..2aa77d4c6a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92380f4c5bc978238c5e0dcf3e87564314100762037d36b09115228b4aa60690 -size 700760 +oid sha256:be0c6db5810eaa57784c7312b5b4c4b85ec61317fa13ccffc7063d5193e1fc26 +size 685762 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index f2222a9b78..e52a974836 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c29ffafd5488d190b2d49cf73db54cc2087cd9da2fed562450f14fdc7a1928b -size 663562 +oid sha256:75fe161a71befd2ddcc9b4447f38ea2f01f1d5fbb52833d2cd5a3e9979237737 +size 650982 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 145beccd9e..2cd2f3991a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1542eb8e54800f2344cf06127d63c47f22a23d299e50c9e829e46d1cc53e857d -size 615657 +oid sha256:78e84ce30206602e0203f18cc9b7af1a1d8d3ede059dae9bc71419b124f02742 +size 604063 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..bdb628ab6f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25d14afcba9d89a313a971918684885c770c364257f8df9ae7fbf8c7e9255b7e +size 729892 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..aae6ff55c5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be3ad388707a88c077ef99f934515a85baffc7f6b3ac67094b6a67246a813ee2 +size 561909 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7a38dd60b3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:beb06b420fae9a2362237f7079b8067bd57c553fd7490177d779106e941b20fb +size 610849 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7f00c5dea4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e288f9895bab1c955b7a1edbd0b65729fe36d6f64c7787a5356f924584385d6b +size 673502 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..adf8707098 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15ad664433d6dc22563b2f9d7fa9efc0d2af4e984abbe4fe42bdb2e081d843a9 +size 539115 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 091ce0ed2f..1eeff0f7e2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:85471eb6479b03816020ee2815008bae244db550a2c77c6ef1bfb7ccd50acf69 -size 737292 +oid sha256:bde4a85c2e35486374309a885adb878a0edc7ced7f724912dbf522232933864a +size 720420 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index f8ed0027f3..5c86e90374 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b6d01de05904d61fb441c67ad9ec576bd370b702497b44a2d71f5ab39c0eff22 -size 727536 +oid sha256:7a56adf8b6d1d034b031bde18cdc0dfa23478008420e79f0228b4235d849e8bc +size 717126 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 6d1a5766a6..551596fc4e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c374f67f0f929d49a849b03aa4d6ed51752c236a43606c286f38d15776a7122 -size 649126 +oid sha256:754e70d502dde65f9bc8c5ff197860c49317d2f184a8ad96c2a7b138f2743140 +size 641578 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 06c786b631..40bc72a28c 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c04a788f230919cfaed4d6d642ca9ddb8a8fda3699c44d2415be3e5bcf03e19d -size 632462 +oid sha256:352d51980dd8764de2097568e5b5ea25d66b16258e999a0edc887489bcaa1dcd +size 624964 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 7239b36cf3..4e1db5f2dd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e676980cdc766629db5877dc7232041456ea8e2a30ce289bcd008d5841286f7 -size 666016 +oid sha256:0edb28dc25f43953e32fcb696bc0632dfa8399b96ac4859141d5fb21144277da +size 655014 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c9dac6f7c4..8d813d3708 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b8bf2703943d88747e501185c75790e8648a0d9058aeffce03ae8b4c08e0060 -size 569707 +oid sha256:ffac33334c2d05fe325b5e2b409e7e51edff1c5e9bbab31af078cbddf0447bd0 +size 562455 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index e0b7dba5f9..7e76d80c53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1dfdcae7d576af517049dd34e1f775da44c194e79cd5e6f34fcbb97f59d6cf53 -size 740360 +oid sha256:98f80f7aee32f2251d9c3c999cd7d4a2511f04b676196d3b9acf3cae2f5fc12f +size 726152 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index aafdadf398..d75684cf49 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:553907f6c51da4f3627f8a41045f5176ab9eed9ff1431b8efea2723ea1726b73 -size 646520 +oid sha256:dd78cca4c1ac14462a7bbf81dca8c6a7cb196d3dc52d0c3a14849992794168d9 +size 629204 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 9dc57f0280..b4192833b9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cdc1f3a30bf1b0ef47c2cef502f11f780c5ad98379243baadbafee3a32033a68 -size 663746 +oid sha256:3452527fba725301f4310ad1618b0cd1244c01920324cdf707afc72cc5ddb9c2 +size 652596 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 7ec26539b6..647d93bd2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c6a8ed2f5fc92f5a952cc86e0a96fde783de8b67202589ed2e887e974882fd7a -size 573555 +oid 
sha256:d35ac436fe3b0129f9863a59667841b8f5fef75354d1a5a3b7c4150e2e34d84d +size 558115 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index d620f15e41..cc5e3eb35c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7bf41d9b1e6cc262613346033571648062386dc5b56c3cf2633c94805f23335b -size 623192 +oid sha256:cdb2c306ed134b43e4ab84bb529e6d6bbe267578ffab9dd1ef1167c908551bc8 +size 622354 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8651d921e4..ed407fdbc4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d629a702dcc788d3e02d3323bcd65a0a5d3e4d15207eec98fc0d0e3ecc4a66c2 -size 536997 +oid sha256:15f52c7476808aff07c8c64c4e230deef47d5af6c40055cf3f1a045555470f98 +size 531029 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 66702bfca9..1bee0fa60a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f40c3408165cdc4228c860bba86f8a60cb10d672be98efb47c7777ec34cfb850 -size 763018 +oid sha256:ddf8ac6d076295124844fd4bc576d26ec70fd6b838f56bcb91e56771251a34c8 +size 744716 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..14a7e10d59 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3376c5d7b1bca5e51f31bdb0a752aba691d7f8422c7516e523afa3de5e6b7d75 +size 743346 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 
8bc0e02fc9..847282706b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f05a808d4e6e11dc3c9b41a7330e0645042c0101a6bbaa3991a34501166b1dfd -size 673224 +oid sha256:af2f404f892989e09090ca41508527ca07b741d16f9b9bdfdfa9e0c1eb5a9425 +size 664936 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2a156a1b00 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dc570ad183113360f9deff766f7674be2cf7db65126e89adf15a266108ff8f7 +size 649556 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..af8e574aed --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:fbbccb8dedb9b59bc588315ed80b284452d59f61240ef540ed093c348ef9a01e +size 591199 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c7a405e3b7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e5ff84f07f9d1c795d3d0b898f1e61feb30ea28992aa9b9121023bc194ed44a +size 498347 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..dfab90d32b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b964a7c7fec135f2e55af946f8df9036b6d03da9f3638fd37823a36073f7369 +size 664856 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..02319fd3e9 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecedbacfc15faf9612f8e3aaf6cb5bf4558b4af2d6af415d6c01b97e449d67fa +size 559963 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b8fa372706 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b83a9c2baacca4d77f1349162ddbba36ead7f5bfa5b0a023c9a9c070eb4c036a +size 675166 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..36b6caec36 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58f340b5d5a02a34ccb800083f500478b0072b11f50d7f83307d7fd2262db8f4 +size 589317 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4e8962c894 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51ad0ded7c1e2ba5cddbbb54747c5720cc93519f773cc7fb31f724220a79c667 +size 563769 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..31ebda61bb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:446caa25e29d20dc2756240f8480c01834cdfc4a8217e01e3584fe181569a3ed +size 468893 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 61fa6690dc..76f0846405 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:a8b25515b1a611a381551c657eb242e5b608f8fb1b2638ef014a6afdb588a280 -size 772992 +oid sha256:1dfe159196a2f1930961e0c92cdbcc364ea204503ea5e3a7cc2d67b6595404a9 +size 760758 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index f2b7ca6e3c..aef83f65bb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0459fa60011bedae64bfc8cf0473b03d0eea47a7b0ea6a08134f7a3a77bda5f9 -size 799508 +oid sha256:c076b79293665a21a3304a0a02ae2ab11204071492cbf70b92bb4898b1fbfb91 +size 786780 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index fb5b2bcb18..9fb5346d11 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:ff1ccb0433b767159ee07dab96a920d1065c75cf797cf6db9235a13be1f57832 -size 711616 +oid sha256:326038e6b2fca0be9b69f0c7f09d4e0c99b358c03d46daf4bf3d104cad530d64 +size 700712 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 1237913be3..817e1ce37b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee824a6193f51e59dfbf85c341c247b367c9e44481db0f24a2d9c71dcb211836 -size 737934 +oid sha256:29036e09530ab9461100084581b7005cc06d252a892b3a2df20a677cdf644a03 +size 726736 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index d16f9bedba..6c85e6ddc7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ddb1b75a85cb8e259bb93e31b64c9340a8dab478d7057f28b54494e4139bc30 -size 707210 
+oid sha256:c2b8a32f0228519c7917b286655e95909875d214ad955aef58e58470bc7b1e65 +size 700254 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 85979cfaa4..fb4382e1c4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea097fdb44ed6107e7d4caf4c9e96b937025621f836d31b167d98964ba2454b1 -size 618896 +oid sha256:77bcd225a5493a601dbc357bd2b154e5a09cccb79de2b770d6bbec51ef3af4b2 +size 611889 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 2f22e3a6ee..043a60cdf3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b2aa1f26533f83de01af9c832b0508d16b4198dbe0933a4f330f90ca8a15e6f1 -size 734466 +oid sha256:3d7fe31e30dcba0aae8a87f9e26adb2cd7473cff608855a4254fbb46f7dcd6ae +size 726672 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 1cddcf087d..2401f7f686 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:43f94ff78c995eb60e87a5e47f8f5844818c5281bd185460c8554c877daf99c9 -size 646102 +oid sha256:4ad0a4a169c6e9a3c02848b084ba9f86cff9b197d6f3f2b9ca15eeead49e590c +size 636384 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index cefad60f2e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f32d4fdc53990b7c6954d6a88cf427583a9ad2c67dbfd3aee23e94084eb2a58f -size 570301 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 621852bfe5..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5bab55a152347e4def1aeab361e3eb2dcaa0cf61dee93d87b802c3e49623f179 -size 548691 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index a04624c23b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9db5ad2ad73d3f93e0bddb33f0b979b4c50870b9c0e7e04ec7357173fa6db2f2 -size 560379 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index d014912a14..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:4daaf03407517f711031798944f8da8c7314364efc996218b9a2535fe584e5e1 -size 540001 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6d20836e5d..5389da44fd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:54a182012ea6040fb9c065d7d1ade0db94cb132e499f4c1c77fedf94234ca295 -size 735070 +oid sha256:aa6be6ea3a4ae76b461847d30ad4396b1ea212434ddb377bff46166014786005 +size 719580 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index aa56020c04..3eacf45b51 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f1e3a35cd7d0f19994da66bef680d2814999ce4c0119783808001058a0915ef6 -size 650948 +oid sha256:56da5b8326aaaede8503309aa2b3c4bf929279a8ff56e41bb9eaea70a3d07ab8 +size 642908 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index a7eb8b1cc1..e67178f79d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba48b662606f1a08c512774fa358ceb408f0a6e916886ce5d25a5e930762fa58 -size 761092 +oid sha256:ce0c66692e2ba261e0e4b77206dc2b0c85aaad7cd606dc094c68ee60502c289d +size 744664 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index bc57167906..fb4ba15d33 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:16a95468d908c1d5ed8d50bfde88998cbd29a929fd0061ff9efa54e8a2f33f08 -size 674752 +oid sha256:0490b92ad99069347dc9f5b577fda3bcaed931d2ec560f98b9810ee186175a8f +size 666266 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index 8d0bdf30f6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5bc4f4ba42508a8782b5d3fbf94e95871502a94ff8752c2b1ca734a48d4e4a60 -size 589965 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 3e379c0336..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9f77adcfbcc903eed81d54dde560f802b0a076746bd6209c1b4e349c0e4e6761 -size 486061 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index b444de3c84..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f0693e30dc3d139313f7c899259925f82034a61f8775369297b4d2e933e96423 -size 565247 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 0e54bb2e92..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7cc8635b4edfb0df548a2da247cf18b2281d3eff45d66ca73ef7441d1c40e71d -size 464795 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 50a4b44ec3..ba6dd2bed4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0352405bd6e306af02f16ca7956a6027af28655b64f7ee0af38b65963c081436 -size 829018 +oid 
sha256:ac3fe70c6cc32a0e8bfae17dca173fc064102a17741fe6cb9839395b45067fbf +size 813970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4dc7ca1e79..c00c14f916 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51eef9dc08e761a8c86daf0c166ab249fffe253892aef94145163e0dc634a51a -size 677562 +oid sha256:cccac2a53ace086b84e2771a6a9a8d02e36fb866f9978b6d9886dcc97e21c1a8 +size 668386 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 1415acaeaf..d70c9bb301 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:3b5aae6bf4684109f7325e41f356420f1efe0357ee3232faf81fa0e331b6bea9 -size 745494 +oid sha256:1e5fa1ba41e7550e33bc43b65dfd668b8bcf34a60b976e5b5bf2f1b1e3365798 +size 736564 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 269b6465b5..6efd267521 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f19e14ee0db82e49c7a35f1136a535f39151e6dc83efdad7c536e0a9e711d4a -size 739180 +oid sha256:cec835131ca78f0f467508e15f49b4728ac66d0cb78fe68b640e4079bde4c267 +size 745100 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f33db72ee9..2148171b64 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:e46f7382fc6378131e07f8b58af43b1fb742b67d48e0791d1ec88ab417c40469 -size 649638 +oid sha256:4eabe3d5b2f794328c30dbaae0f274214d80bf84213258c1c4bde6a633790cf9 +size 641152 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..676b4b1baa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dc015eb5c06a49e3c74236d4f02deefe61fe49b1d87c2c39c358f0bf73699bc +size 840438 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 196bce0644..ce3512f02d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec6d3244ce1779fb328746de68d974f37965dc2a661f37867ba4fdd3f98c5805 -size 
595741 +oid sha256:b36a422c7850863d5b5810dd6526cbad9186b9146b79d92a264265c5d0457616 +size 599243 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..eb6c8a60e9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c24f54f755a7f2f1664eee1fe3cd312986034c809e02d0847d81c26241827e64 +size 661356 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..cae5d13378 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46eacef1eab5cd069ef67b8681118d48cad457f80a2b0ff759268330959c2ba1 +size 767620 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 242fcf890b..356202f882 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:58aeb3a842b83be9369649800a9f907fdb9b4c1ea0ceca758901913963a59fbe -size 573391 +oid sha256:f43375d3c94a0ebedbd89948796ddd1376ee66874a05aef0422614b03e6ba444 +size 575117 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index a3b38734b8..23a323d5ea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9938986b311b31bf6d73e538f526e46940509843f40440fb9f3470e23af2a989 -size 771784 +oid sha256:6d8a5328e109671eed912ab2d46a601ab400b04257e0ab357a8347cf301a357e +size 754716 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e1cee44ea3..c118a51953 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a62e8a98d75ee2fb746ce9b5b1f905848039cc79976ea9a488381a78d780f7a -size 669366 +oid sha256:6a5c0e118befd8d7e5426febcd6f8ab8e87a5c9f034fa62262051a3b160d0ad1 +size 660142 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 1c3af05f30..c1c22e0df1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ab5cfed8b2465230d75de6df79e5fa1ddec9b3faf6266b952e8cd7430c01df9 -size 734388 +oid sha256:4e1d2ecf0c4e87fc77b7e7a630fb4dc74c0c04c70423db90f2027811b688bb49 +size 
721758 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index c6b01f7504..3d7d95048b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c4a6a90a96e536b52acb8b3ebaa494eef6d41e4ede7e1353dade0a5e8485a400 -size 702420 +oid sha256:ca59607c2bba575cb6f11228e9b5dc7ee4b4315683e5c5927e8ce5eaeb472b6b +size 691222 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index bac1e905a2..55a5273d1d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a662aed542ffb3196844e08b8f108b182a229be8c3890ebbb3b1e2d2e0b6f29 -size 642132 +oid 
sha256:95f8aa64190e1e25fc022d2f9a078d558e37b2d40fc7b76595e0467e006a4b5f +size 632858 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..50ebc45a49 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c435580ed21367940dfe1267a08d94bdc841fef88cb2160948c6ff57948dc23 +size 781182 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index eda6aac28c..e946507c95 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:743e337266e22935dfeab47e6dcc153acad97c5ad80cad65e3dccffe53507b27 -size 585819 +oid sha256:9f0a2b3ba084819fc68b6aa52c7c74053b770a3a8f67a6dc8f59d2bf6cfb4246 +size 589765 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..315d37436c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dbe020209989ace0128b88e156f8b8513afa90302b77e0ff83a224171dd5a20 +size 644280 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..50b032a334 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91643136a7e77dd989bb01ad36348018c0a72c1e12a64b3913ed50a364d0e414 +size 714580 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 
f55406f2a8..7c478db15f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:df7917b43bd1057890895a948c4bc26cbce2603ffcc150b49989d2110bfaf6bf -size 565443 +oid sha256:9d3fcf6515ad683f3e445baf70386ae72889698211f8be610b7d2dacc4e015a5 +size 565639 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index cc789e97f3..b80cd28d56 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:68e180fb73b795b22c444097d7d7976622a5063829f407cf084feba09d997517 -size 808462 +oid sha256:25ba8938f16a3ca393079b71f947e4dbe86c93dd65aa37ac0777e8d31c14ff19 +size 796326 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 4a2e2e4438..0cf63cfeed 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6389322b02df3261c0fa7a651c5f58875978c6aacc25c6775bcbdaec867342a0 -size 776358 +oid sha256:4777a92a9c59bd8e46069e8f32992712951765d825df8aa3aa4a45c13f37d946 +size 764618 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index ce71b0d310..b4e84034a4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6cb2ecaedd38317255812ba47a33a6307d9015355126628f876381c1c25c83ba -size 719358 +oid sha256:2850a407b3dde75348c96b0595de9594d7fe6c1fe982667cedd98361b8a314c5 +size 713586 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 1bd53ae6cf..e4e8ddea83 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:55a10149b656d6a8fba608b8ced1fe1fabe94ee7033cb518fafcccceb32ad19a -size 680742 +oid sha256:f90f9896435b5e863db8a2cb40c73f188da96a07dbf31566141d5e3c778040db +size 675218 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 1675fe31bb..741634380d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:394665d4e71219ca1e12564c9e0315ba14c4ebcbb0841514c9b6af9a363a8f1a -size 695352 +oid sha256:e425d4208ec3b6b47b205e8ce96b19156a7c22c1e6bb632ed6e359945af25e98 +size 682476 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 28bb50ac95..a2281da11a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:914d007448e964a31c4ae92eb109e9905c3c6e4a0bc1cd661ee8d0f07161cdf0 -size 597219 +oid sha256:cc292313c1a6df6371053b99aadc89b9a90a5a341920c35e575db9d00f30bdcc +size 589079 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 7f456783da..373be4338f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f277f2a272c98dcce794457d5b80803fae9de15288ef691564c48448b1a8034 -size 771966 +oid sha256:d5e2530b90b7404b688cd4b6dc5eedcfd9b14755d3c3f8f98232faec1b422691 +size 759090 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index b3d0acacb5..294c221baf 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:061d22f632035c37729704211e20f3fe67876613710b1320a4826bfecb56cfdc -size 677238 +oid sha256:4fc81a7d50772bba25d430317db5a33131d6d5ab32fb5734da13ca1cf0d4c50c +size 660810 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 463a140a87..1ef86e1024 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1f0c6e46d71e4a76ee4e4787e8e9355fde469cfc67692d0452e505441187024d -size 700482 +oid sha256:7d936b5406ec075c1465e6679a1eb131c473c397bf4924022ec4b87eeecd9f45 +size 688148 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 909451dad9..139f67da0e 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2aba665d4f2945ebd86b89ea7d5feef24e7c31dc8877f64808eedc168b404999 -size 612955 +oid sha256:2d5c0fa87a39f8ee026b06fce099e2d829f1b1ba3a3f99325e63e2e9af69108c +size 594061 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index ca11fc8b8c..3fe240fefc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:13268f581f473bedb8aa6d69e0c9662a4cb77c69e519dfe7c3766f908c687507 -size 657362 +oid sha256:cdc15d06a6278a2683279caa88011b4e96990a528dc173e99065e72219148469 +size 648384 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index eb88c047e1..af02ce92af 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:43ffce1bf963afb41b7c21d718d6b6812f387173122f9dfa1c9f18b957948f92 -size 560561 +oid sha256:6b880b0f4e0b181f76b10aa6262e2cc5b87bf80509ba361151e18ba7823d2cda +size 554691 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 40d108ad14..97375f7476 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fcd72a319989d850221feeaa31f3e11b6ee44d879ceba5f2bd87e08d9b53375a -size 835226 +oid sha256:1e2d0823eac5fc0a50a834c873ae232e175584b9d35daa26c1ffabd34bb0b7b1 +size 820574 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 
0000000000..9ad8450cf2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55d46237304c3cee5f26c26bd1c6a24d3b7a5c62cad5d7ecc2855f18f861cf1d +size 790098 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index b05c99feef..0f3df9ab25 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:38194f988503a0a38f77ec628bae29df6af4177c1c015803ee3a6f14d6fe25ad -size 745430 +oid sha256:7111f62a9a0566f6093952cc3b443a1ce709203ed492453aa0dc90429ac2827e +size 737340 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b58a439190 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:d3ec24ce52d0e3a23a3233de80f8ddfe415ca34c749675abc960b175f5407fff +size 700500 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 977047e29f..006a7d2b7f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:64bb75138a04b5627910eb79daf28f13a9b4e7a2e05a89d2e63770f03cb771be -size 614665 +oid sha256:089ced8de7a15c62bd65d94cf897a569a4604d3901a98956b1bb31b5db7510d8 +size 628726 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index c611da7c4e..73d1f923af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:c016c39bf6103f30ec90b32b7985350c85247a02163d84bb3606e334b7499d61 -size 509775 +oid sha256:aeffd4b4cc6cf99e11a1eeddd7b121d71db975143884f0f50f9a0f8b2f81145d +size 526843 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..dfb69ad2a8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62e23e128e88c39b553032425379011232a96a7aa890b20fa811a5d5fdc781d5 +size 699076 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..83e0ba5f88 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dc2a1186689da2df465e50ef402b736abb3d184dc642644c39e95a82aadbb8b +size 593345 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..50457d7223 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:241850bcaec9b344d306440376734cdb0b7a46481504c5761b1b2537046d44e0 +size 712296 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..97d3485bc2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f007b597ff4f4b12d02f337eb499a77daaa6cb6a0ce725aecf60170ff509a0be +size 626104 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 5a718d2694..a761df7642 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb294e7b3d5fbf32f1c7f7029daebd53abf1f4ef8e3a58fbce6b67cb04301898 -size 587233 +oid sha256:c63ca95ff4a15499db8f3156b6af2fab1170be1b0e146455bc8a9f17236006d6 +size 597199 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8d7c3e44bd..dcd5f254fb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98dc15f981249a315e69c2ae094667458e74b04117df012779229dba90462b3f -size 486141 +oid sha256:d62a09f40338372f7b07e297ca081bcf18cc8ce820b59642c5d93ef29cca4253 +size 496057 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 852421d9d8..c963a5a137 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2207759b0e3c12c8e3b49b2abbdc5a58c2690fc8d2c27078eff26baa81d4de88 -size 692560 +oid sha256:acf0c65d703954a33ed03910a4542873db52d959bbbfbf8ee2732abe4a38dde9 +size 680030 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 3a44174fdf..0e0a6b3aa6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:837ed81cd780a11a0305a46fc040ae6bc946f110749ceab8e115ab5f16a88cb9 -size 604787 +oid sha256:0dee0a2e3b50e6012826132f059ba63c1c3a8cb69b1bb3d911791ff4c43ef86c +size 597535 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index ffe4a79826..cb6809206a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f8c985f5c03522504b1c529c6c3bfc0767691b16b814c67c89b12f9cf6d99711 -size 717498 +oid sha256:26510b752c809a7ead4cfe4f6f0279778bcca21ab61770dd39b0d58b5886abd4 +size 705904 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 408d416dc2..d38f88384f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fcb8385ec4adf699e726510eafe4123dca6b2e4f3befdf2506cba886021e085e -size 628048 +oid sha256:c636a3776a11abc2fba0752c3c6b7a0a18714d8fe0a6863e353a7a5fd966c067 +size 619268 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 39dd08ec62..240da4a56e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1866bf7c66a5a117ab1ecc9405559202a4b4938161ca624cc41f7cb4cfe290d8 -size 700650 +oid sha256:e5682a9c35e267ecd9d4f95e03066a0943366cb883bf2a051e2956dcddb6e824 +size 683778 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 0d3902364b..79ba3ddc20 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8573683e2df8934084d49fe963fa8b8b6c9841296aed11b4f2185d98d177369d -size 611001 +oid sha256:2165f0ecf1c66e8e458b0e449fed17078d6fce2c9cd30913743d29251b0a93f0 +size 603651 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 76bfe2f255..1bd26aa0a2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:905f74201b1a17d6407d43a2925a955c3f8cc99f024433628c76049a76e1e2c2 -size 716262 +oid sha256:e545d6e5b67bc9ca4172c41ac958cc7261db4f03fdbdbbf39b09ebb1f159eed0 +size 704324 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 053fdfbf6d..5da99f1fd1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:39cde4e8bcafbc2341af16c9e92e6efaa9466524117de5f2b62559b910dcc6f3 -size 632930 +oid sha256:16735af87f14204cb1389568a1a261c30a61958f061e3d5f9fd8f130190daa06 +size 625136 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 1449d485d4..51e4ba4831 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c33fb9538dcb973b38c5d4ad68b4c0d312971302098160fbb5199c9d08ab04e4 -size 781510 +oid sha256:90cc2b465d681ff74839ad879cb05226901f5d529db158f909da5d5116fae06d +size 768142 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 334d3a6c33..fdbc38bec2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d2db357e71ff252cd02f497788ab6b720c6e1285bad2b8f85e6f50e1ba4d12c -size 692110 +oid sha256:4dcd5cab6fa156bba2838c3b161c98a545462c9182102f82aa5482241527a77e +size 683082 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 7d1c4fbb66..c5dcfaccce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:803f560c60f91f301b27d3e58d16789ebf27ca0ddd251d07ee414238c0694349 -size 805510 +oid sha256:3c6e3f86c38c537e2699f06c4acb630da1bad5a1a8f731a311468f7c3a6f6fdf +size 791402 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 74f3f4c4be..847ad4f4f2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:428ca0408c44687012e1d63a30904c9bdcff4f9d3330df53b893eaa40f51a86e -size 714582 +oid sha256:0396bab774251a22062a3d669cad868e6c473e3ec22ddadb06911b4a8c4d3099 +size 704814 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ca1cf92055..94a8338efc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cbf0494f1ddfc3eac13b8aa5e4cc91857c1a50bc479b5d274d148ff99ea5caf3 -size 785160 +oid sha256:7e3a78256bc2773a41fec3b7375234e0cfa16c6a9712843d3835822e12232867 +size 766856 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 77d5b14dda..12cf65ccb8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33b5d85ed68dab3cc9f05f00daf574b3d685354d7011a4cdae9b1f3603df7561 -size 705576 +oid sha256:4a4fcb4e03b9773498db0171bd1d6ee3e7c22f3bcec1e7638117f2b8e6f61149 +size 697338 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 8d773d924e..31599a6e72 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a30eb19e6ef3a6d3609ec1576b100ae2e8381b3fb0e1ea5834fd070f72dee4e7 -size 804226 +oid sha256:9912738effb86e7aaad8c52d90d593bc6e63a95e0253fd64dbb29e09753d6653 +size 790166 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index c989a90b13..25692d33a4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3d045632779dc8acdac7c11a321f542fa9c1c1ba4cf9b9ac514fe970cc7226f -size 728294 +oid sha256:25c1ce4f4f0da375b761f65c231cb08238bb22fcff7475b537ddda2c72a961da +size 718822 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 37f9feb8fc..d3bb9fb8b1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ec1a75cee8b1a3bb8e76190f9a662e5d5986f114a2e54c345322e0a029b956e -size 692564 +oid sha256:8d71e9bb03c346b263a9674bf02fefff959858ec91e208399793c2b105e0dac4 +size 680824 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 5e3811ad2e..62b3121bc8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:52e768ecefcbde05db894f31ebafa0736c7189b09c3f0c57b8bc5b5bd2a3701f -size 605581 +oid sha256:c939b9635027ae39e899077617273aff4c0e9160902c6d366882f24241c0d17e +size 598329 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index d44d5a046f..a446e37236 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:977167edc014a4b29d408ff8ba01dd8a8f33f3c5a36fb146a0f35165faa9adf8 -size 717404 +oid sha256:2c5f7fe7e81b73549abdda3b0cac944e68a12c8a93d6626d677699c7449bd283 +size 705810 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index d8b0d0ea4e..f5cc62ee84 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:247e7b97d4f19b96daca4536ecbc60504c607bcf20a1ff5001d9b17b84d9325a -size 628842 +oid sha256:c7d819bfa726a2eb38f40e18cd5071d9f7097d1aa90b69fa48db6dfd10b9dc80 +size 620060 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 96b111d654..eff2970c31 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:74d3f33632ad878790e53f4265dbbef980bef0b40f3fd22cdc3260ded6955e4f -size 701442 +oid sha256:869213a14d41a03cc044986f4d5a270100a22ae576ddf9cec30cbc0a6409f4a8 +size 684570 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 8debef1d72..6d27e4fc08 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:496b7c76197909c30fdee2fdcf1a2219ba47dd6aa303d543c29f7e1224758b4f -size 611795 +oid sha256:6ae3c1b3a324297fea91a0ddd73dcbd7f88c2749ddf5daeb764ae0d8245d45e8 +size 604445 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 313108d5c3..387170aa51 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8fd6191ad6c483a1f7e271ce9c9163e10e33b110088f148af6ba6c2a7e861595 -size 717056 +oid sha256:dc35cfacc5eecb567e8bb08555df16756d121a751626050891196837a7796c71 +size 705118 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index bda0742ffd..ccfe96062c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:550e16e6b856c411887c2b9b9153fed99ee4fd90e9c1bd78c008fd814a34db40 -size 634514 +oid sha256:0869d6a1b92ef8a67e152da3293e0116ac347e5b6f60eff9296853d8ad7c4845 +size 625930 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d040b7901a..d64d74f29a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c0fea0e0b604a5ebf3acad2157a96d81868bcb6a8dd45582ba3efbdd110a283c -size 834692 +oid sha256:104bd18244b83cc884cf2a98a9dc46cd4442acbe8489738c7263869221fa734c +size 823198 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index c32740285d..3964191a2d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:88f1718d22ded711de852e0a59326d43b63817db9a8545810570150dca4718f8 -size 808988 +oid sha256:4ee4861b6032295704e193473617fcf007eda4be704e5cf47457deb4621f172d +size 796852 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d942637ceb..fbada9db8e 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5eedd9b17a9a06d6c7a9af077bb780919bf7d9560e186e850d8aadd2c1fadfdc -size 789058 +oid sha256:f9d702303d4ce09567c6b081fea6db47082b53fcbfd23d28c8b4808494b951de +size 777266 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index d11bac18fa..486db91618 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:55f436f7304be9743baac21117b81ed2e1c0d94428f34b3f764eb0d096a6c5f1 -size 765128 +oid sha256:2ede38240a585b6f2c66b540428baafe603331bb2575c7377bf618f5f2b1a64b +size 754226 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 210201cd9c..c32d8b21ea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:66140ce7a8142714df0caba9e16ef2e215b258889369d759e13c82bd575d79ed -size 681784 +oid sha256:0ab0b6802923b03234d9c23d3325bad11b2905cdc0a6985308fbc2562dd64bea +size 671818 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp index 57c470c4ac..619ab7b34d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a4a9ebbfb7b4593005d45b9169d0274138c1153595fcac168d93077be4e41bd -size 672062 +oid sha256:f35b8d789c246e0030e5e6016f1d118953324ce769775561b9cd03662c387887 +size 659976 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1379eb929e..b59a6200d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9dd7f8aeb198d85d8c67442cc74a081e592df4d2a137aa6b90d3be2890a29589 -size 642956 +oid sha256:da316aca8d5799a71012aeade995b97849a711a7664e48663dc3244e9f5ed2b9 +size 633238 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp index e9a30b253a..1dcde87dfb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:6b89b004607f3c124be8316fac5ac2b4d646b4b5aaa640151095f9bf59be7847 -size 635012 +oid sha256:8143a7a0f41072cb036aeb48505cebdfc7eeed35eb5c855ba21b9ee600b4b46d +size 623122 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index f2a190b156..3dbd7bd414 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9aa762dfd9718e8626ed2a19a4aec5bc330f9a1a5120d67efc203a68872fd4ea -size 645606 +oid sha256:ee978d5c811dfbe7eceb5d1b4cefd85e2ff91d99c0fc30e667f589351d2d04cf +size 634950 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp index e4c0318c60..699d0d0348 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e6553154bdbd79c5a4ff00523d127e9bace90693c8be62a6331e6cf427d775f -size 663094 +oid sha256:271f2bc3b9b73cfd641c0944a2cf3fcea1ba5f936c06ea9e92c5a68ebb975546 +size 652340 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e74c81a7d5..61ff631587 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73dde437d67a652266b13af7e7d86a64780596e54a5a37a5e5b60dc8d5ca1654 -size 822994 +oid sha256:78f35e006d13f86df92b9b86e251886eb37a9ac9d58e964a4c69ca03f2d6e735 +size 809674 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index c124b2c546..29c9bf6594 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a3e79a208a279e3066d21f75b7268c7fe0029d6f10c07e68aaebf8e5a69dbb6e -size 798080 +oid sha256:f4f51791b27d6bcfc2c53a54d97b1cba129f441b058631a5f19903fa5df06b41 +size 783328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1d3d4bb392..836c6c8d70 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c8f3473d7e50c1c2a78ef3623965dce2d2786ced0b7a89de629e2bf16c29318 -size 782342 +oid sha256:a79f8f9227d4d5bb0ec3fd58da71a579252eb4589f9388f483817b9a74d99316 +size 771242 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 
4c5ffb7217..169b013a2c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:197c6cddf099ebb09b0b2611122a5ab3324593115fadf7b8167cbd24b59370aa -size 758364 +oid sha256:0be166e840de83bb9486ef014c1340f34b168b432adbfcaa0b273a7bde6f74db +size 748202 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index a72b7a3990..dc06c5032d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98c16fc4bb762df6403080573fc261b6a07b5f44955c00f44d6a678596afa566 -size 668506 +oid sha256:4441b080cec4706cc6cbc023ab4394d73272c1496231a8d0570a3019350e990e +size 655038 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp index 77cbb06536..b2c285359a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d289fdbd9d3b69e51b4ee2c383119966a4af9db52556c94ba5698705c30f663 -size 658786 +oid sha256:2763754240488074c9fec34ca8aa6b30e3c337a25b256f8e03d44cc8fc40007a +size 643196 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 760028cbdc..a8ba435df1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ace6d731d8332cd0124c7ab7b49ad42711bbb95725831bfdfd29843c7d3f7fc -size 635698 +oid sha256:977a486bf4bbfaefbd99b191d26f2ecb4d93ef4aa1eb5509b2338cc024681971 +size 624500 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp index 351d5c6028..f9c9704c76 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5d7a023b5a98719f0819178d0baa0e6ced8eb8aed32cfda129a9576fa2dc3b9 -size 627852 +oid sha256:d55cd933c5cbd75ce9dabcf200791f5f5ecbf30adce753a4186567437d1d489e +size 615271 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4b8a7347ba..6862893aba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c50c4b6dd195d35a7c64bade298edb494c689292bef19aa4c67c24247cbae788 -size 834624 +oid 
sha256:5868f9ee31dfdfb3467f3e4dfe6697972781b3bccb0204e1fc4691b6bea40caa +size 821352 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 890662ae96..155c460c36 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c04bdf49cee9c368842ea571832e9322ee4038fb75b49a80c2f6a5005a776a51 -size 741474 +oid sha256:8bc62745c258bd4f3689192851f2b2e3699d01ed5cd57f0929fca07ed174a498 +size 729240 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index ce1acc9e2f..94311c319a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab474a4fc80f0eb32aa07a69fda2f6cafe9703a337c1313caac7620138033449 -size 812718 +oid 
sha256:bfd71d136d9d6c70a2e27aa4e403fe67abf6a5efbd81ff6f4beeba276cf47c01 +size 799546 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index fdee343f02..7e24249198 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e01b82735c35e11518a817445f78d9dbf120dd7367755a82b64385a4eb36c5b6 -size 714338 +oid sha256:7677410e300776ca98fd82286defdcff649952ded4aec859fef91280d78efcca +size 701118 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 877c5f7f2b..cbd4398471 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f630610356a4b72c13e9a984466dc053f33774bbca575cb98d62a493489126fe -size 693826 +oid 
sha256:e7adf99151eb592f0ae1f547b228a72d4c8e28b216d6ae1901c7278f4848cdf5 +size 684304 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index be15ec17ee..3ec1725f26 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:78aba707fed9399d67d8008beb995d7d5d48dc4c59030ca05af6eb66c92dc5cb -size 609999 +oid sha256:9197e81d179aa2635c099866a73bbf4d67300fac15ea001d367843e20dbffb1b +size 601317 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 9e79a5a316..5d34242137 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6a4e42552792a5f1dfb5150f9cb08264a119dc12c8b9702437c26d8a7aad99a -size 797178 +oid 
sha256:8bb260b4202450430c4a7852ddabf60acde36cb37947016a76a88abb7356a449 +size 785288 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c8a7ed38fc..ded3e93135 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2818636b287b6f3b8095f5c4db1e1f3732a8e320c4cf0766ab40b32ee350661b -size 704966 +oid sha256:c15d012c1e5547f7df2ecce08db2e8728558edcb7758aaf2b5de1684438f740e +size 695098 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 6e88568851..6117659fed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a65a0b339625974e6ad948815d6a9a7a094540db918d040f548d97a14bfecef -size 777688 +oid 
sha256:9f0e556b6380ca3482ebded0f59b599cf72b6785d8ce1250933ac105f55270b7 +size 766688 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 2ea3299cac..423b70689b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a255cc9bbc78b0000e7024abe312cd3641f2e16563fda30e58fa3afb3ffe67e6 -size 680396 +oid sha256:e85e873cf031c9614ca6f8130101eccb35039e71acaef33583858f10173a78d8 +size 670332 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 93c8892dfe..73ac91972f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c74be274338178bfb94edfce366b61bb613b6c730f2d830219e23c7ba50d5e24 -size 683786 +oid 
sha256:73625df1bec287b551ab9cf4bfb83294998b337d5091d12dd6dd02cdf35e3252 +size 670416 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index d0468542c9..f6f8ed71be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:069b044d987a80da893c38d910f60b31025a0c62c7900b80306d1285f3d93bac -size 586739 +oid sha256:4b465e160486da3dbc75fc6e7765f88856f2faa6242eecc1ada7fd159f402eac +size 576231 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 42dba148e4..b23ac8ea4d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:8a137de8893936af12e756e6c37b7f89d4135217868fcfce1c321e3e90504dcd -size 675890 +oid sha256:f1e73e923ff60b9a0fe74736844ff0444215f6e35be62eacc23e843e648c01f7 +size 662472 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp index d26778f038..244b89c108 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b259e1345a597d779b88043aee6127e710b3a329c0e4a407e1c4f85fe283df98 -size 576229 +oid sha256:ca7204166226546cffce3f02d6ebd41c0c35ee60129f5b8d26bf482253518199 +size 564339 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 8af36a5134..790b9e3a11 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:df17d8e03e6e9de76082672cbed67dbc7a3cc1ebbbd3e42d79a42fff09b1d1fd -size 715606 +oid sha256:8df1bf446184e797db13a2e09f56340f8614c10319848f0c24682eec8355594e +size 705788 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp index a752555b0d..30d3516e78 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:634f9d8e960700bd580fd043ad6a810a5bc26be5c0642ca16c7fb9da7a9f3dfd -size 626700 +oid sha256:c628cf785e9cda13a8b19189dcbb5ab88f1dc038e6ec15a745b0f505894008ba +size 618608 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index fb0fae50ae..b1142230dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6abb0e52152c62338212a398b7ddfbada18b4f46df19d8873edd8d29d21293f4 -size 653198 +oid sha256:9fbb0b28da026d9a570d030938960245a8339ffaff26f0c0677603501f999c1d +size 641950 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index cbdaa2f42b..789c3ecb18 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e345bbb7ba59bc1558027e598b88cc781d9c00a02ebe1baf3374b38436132971 -size 557827 +oid sha256:0a7b3d088c98896647ab10519fb811980e34ab62c1923675472583ea2e05dba1 +size 548453 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp index e11f742541..ba92413571 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f43471d1fe3d92c398f1fad48aca96e35211d2ce8fbb15c9f1f12e9b00c544a -size 648262 +oid sha256:2c37d6ecf3c72fcaa5ec9be0ee1b7159b61d191cb37444d61cbe94fd7f3a5d78 +size 635386 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp index 5589c70018..bfbb247bb2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ca0603950d9ed4bc8056e65f95732f7537d8a7302aa73d249017e91d0f93a0c -size 550721 +oid sha256:f0d4c1a03cce3d4802d9f92a40824ae9964f04a18a5972fc1d208b22d06ef32b +size 539227 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
index d6520babc0..b4ee169bbf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c784f996942b641f108eba06b288a1256007255b3b6f9572b4aefab0afa0a09 -size 740564 +oid sha256:3d1f9ccecc549250b0ad6294fc654a94c1d4c9142b65b6c61e54535a487df868 +size 720980 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b7138ae777..3df828a9ee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf2ce9e33aed06db4e8fbefb4a97f3b8903f4343771115622ff6cabc626ec24e -size 699370 +oid sha256:34408e9000c8af1dcf6d35fcf930a61fa0784b4f01f4d35e8a60ebf5ca8892df +size 683978 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1e064ed654..c3632ff2b4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d9a9cf759837162fd6751f046b88c6faeb7b62ebbbd6bb8757a96ac0e70cd704 -size 728274 +oid sha256:c6d09ec99884095f1471abb22825db0c66fb763215ce8eac578222d326cf19e0 +size 706568 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2ec261083b..e52ebb2c6e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53d187a3117d29353bc6153200da37fb34d12de99093707c35c1b9fbb388620a -size 697982 +oid sha256:2b0207492505e3d8015b79d87c8fdccc53fbc068a1caadb42723b342717abd90 +size 677558 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index d5b9c62d7a..e8b563168d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:49e40ceac472ba845a5bf4cd7ab86b645a607fa23e40d63426aef6b910bde6c5 -size 740396 +oid sha256:8585396d3200b7c7d3ee9cde2b70d666485ed906efa88397c091d0f2248ab2a6 +size 728852 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e3714b9a34..71bfe21fd0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c4d64ff822daaa2c88333e8a632e99fafd7a32aa70de8e93d155c65b077b6c5e -size 646162 +oid sha256:616a98b290ffd6680abca97361f80e581e76f93bfbde242ecf8430a836ef31cf +size 631412 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 77fb4a4c5d..e037dce691 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:18b5c98f027494a3f4b1e76573ed484ba7ca51e64c418269c5e8df84759e34cb -size 712472 +oid sha256:9079c43ffa1c83a120de4789786bb88c1e66af757af1d973d779109d0955b73d +size 702802 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index fc9a4977d3..cd3c1f5da4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb61cdb7052254e1f24dd81d2af0bc40f9a1624179b57a162dd1041e71b1e094 -size 621296 +oid sha256:68782e71d500f2db07ba84a3b201c24fd9dfd5845eda8ff243cec87ee3d97a39 +size 606841 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e6f0829f65..ac554fa48d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4cec643006bd7f8e0ae1441eb21c8c26de606a45e4fce071218292cdfb5d3f52 -size 890686 +oid sha256:92325abc205e35500c6a83e3727daaf4f5b488dedc8b40ea3cb2f41a4e2c1439 +size 877612 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 8111045c7b..1ec314f0c7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e3caea39e830d099568ee46f4a64f9ab1f7c7f160b311494567eb77a01205633 -size 856250 +oid 
sha256:dd9673833af2b06773d74c97d4dd810d5057d866663b3f31ed3b5ac6d95c1fea +size 844212 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 245468638d..62f00e0d77 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a1c6030fe9bb35540f0948b197ed5334365ad6fcc296c047f8930c2ba1557e5a -size 848258 +oid sha256:356bdc990403ea667ee4e524abf9283ce3ee82bf0b7003bad3b0724b28a53069 +size 837404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 1b3c108cb2..6de61a3f28 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:583257cbcf71c70156593c7cb109f7a9ae621a460efff4062e6739fdffcf9871 -size 815596 +oid sha256:d148a61d56c2efb4e67eceb42eb4fe20bb9661542d0cd6692122e9c086dedaa0 +size 805780 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 335431f28b..a2cebd00c9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc0d83d2c9ebc042c3d5b64082430c834963d309c60bb84d5994f0886ca9a12a -size 715774 +oid sha256:c09852272925a1e507715d00fc89838724b3952f570e9702109123a3ac4992ab +size 705068 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp index 582c8998b9..eab5d09372 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5b1c923b609b0c65f190ddae9df450478e0d86409f77b1d192d513a44cbda1f -size 701810 +oid sha256:4421e3f59bc031c657d047c524fddc25155f82120960518a11a83ba99771e06d +size 688934 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2e69a6b8de..550be353ae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94eb4d8ef30ccd6b7107f5aef27d32a563dc38c3a3fa3c8d6b7be18e1026dbe0 -size 681830 +oid sha256:7cf79311d897f972efc97c8a66f40f09d7a634fc3208ae690657ebb14ecdc791 +size 672112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp index 8791066304..9a065c94dc 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:500c2122aa200dd9dafd43a3bddc1f6f713ad5c7a2a67f4406497290253acef1 -size 669692 +oid sha256:698d69641b29ccaf4ff4a5b05763e9c9b986ee972ce22cea30065c7dacbf7f14 +size 657852 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index f5d730117a..8d351f7df0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:614322db06421425e2ed13e75dd70cdfe7a1629a2bc87f83ecd1179d6e68a0bd -size 666384 +oid sha256:bcb75f5c34b31b2b5e574d4300aa7f14ce277a97334dc06dc4d41e9bc83060a7 +size 658688 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 313c3fd123..fb49662d54 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3fbf162fdb498b5feb191e96d7b2fc1e436db7e887a8a37cc48ec19e36180c68 -size 695680 +oid sha256:63319bced3de31b61f0c82adfc1a310249aea716ef09921165cc4bb6b7b0efb6 +size 685024 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 2be8b5effc..ee112357ca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:173c1d5ee863d7cd64572c3222601d80ff87d944208862277fde7917f898bd53 -size 686536 +oid sha256:30dee9528aaea91f5488ea74181b3698f45ba0bb460a0273efee662788ac1533 +size 676224 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp index 0e34b5452b..c1073d5a4c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:38e079c59ea8fda5f680b91aab3c29e1563affc9750c509601267a1c0639a2a7 -size 717854 +oid sha256:77e621c7f68d790945b44cd22feecf89b800ee470912e480075ed9b9276795cc +size 707100 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c774d535db..42e825f39f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f7f4cc188230b8659e82906a49ac6e59caa0c4710bd1a0dc4805f072e3ebc4a -size 
875682 +oid sha256:371b1ad9b7a0a1b5ad057a3f3a14eb4a7b97af80c5c9da364e0821ffffb9aa94 +size 859994 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 329e8555bc..965231d5b4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5f58cfd6ac325213673f7f37663e4272990e331aea89b6cee3ee61d2712f419 -size 840506 +oid sha256:3853da0ffdf88cefac0c0def525e741410a225938c7baf9293a11f6d6973c08c +size 826594 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8c6236afa6..d740852268 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:6a5d29794edbd7674adb8a413d20319b05443f5d409074343bf883093a4dc4cd -size 839026 +oid sha256:e3ee4eb5f4f86a89d8d481847c0420c3d0842f39da818a817b74d1eb2bb2fed7 +size 825064 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index dd7fa512cc..a1e4cb594f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f77a73bc7a205c34f365927a939f57c8d974ae3c1ff211364c401cf65b9df97 -size 805280 +oid sha256:0f5223c42b9dbb4320c04b419343d531f77336eb8dce77bf219bd6c402132c2f +size 793390 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index c7cadc3bc1..44bb6d868a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a7a961225962f5b897bcf75e350b59d220432d359ad1949d5de3a275761aa14 -size 698304 +oid sha256:bca265e727d1d8bf9bce26903c33f017d5a13992e6f8d919f83580a04e38660b +size 685626 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp index 91047b6ae4..27456ed699 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8c0d0d02637e4e5ee29693d3e6f3952d31b5700babf9b83d195b02fde937bd7 -size 684292 +oid sha256:5269c0bc36d46b76db1d6d3f287f7147f93d5a4db602a77d9f2d8d078f8c1159 +size 669492 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6a1cf9d8b7..a20b726af7 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:46f1c720e4c96d3877b8ae45a9c2d4a4b02d317e7ef0d10bc7c9b2c82c31b872 -size 669984 +oid sha256:03905a05f5654404f5b95b2ede387d5695dcc330aa053d028b04f13b32b54424 +size 657552 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp index bbc6de844f..9bd49c7729 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53723ff1d0e1b4fb016da6e19262c5ca9ebc892db746bc1b72ee427e5a528ad7 -size 657846 +oid sha256:d79619d35526064873710b8340be35d0841d1af5f892b53b2a0971fba17a60a5 +size 643294 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index effb24bd06..52536705e8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b2648ee652d394f425184715e1bf7bd1a47abe98e4c2617167e779abcb8b795 -size 890272 +oid sha256:e6e9b1a6e73fe082e6e6450ec370efbdfd598a211f3393497a1cbd357d1c6c2c +size 876064 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 34bc1188a3..7e9d9be816 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61e291bacf02b3bb91a59623ad7f2fdefd94cf1c1cb0b77b5c8ee2bc5d4dd4c2 -size 792682 +oid sha256:c09f114c39fdd3ae25b585c677cf3046894bb955b2b85c37c3ccb4145ee72574 +size 781780 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 08159645a8..c04f94c60c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa065dc76a687a8de339ed4d12fadf8823e3f5f2cebb15fcdb8e67fc5babc750 -size 859486 +oid sha256:5f478899b1fefc55371eed869ea2802f0c1c2f3482841d939b16daf5618b4e9c +size 846314 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index b05c62d0cd..f6dfcd96fe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:54598a1fc214377eed779dbaef53c49f1284b6b256af30093516f250adecf521 -size 757702 +oid sha256:ac9511c8def16b45f5f25917d340f4e52eb3d01539f397a23111466e3878eedd +size 746602 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index 1f2ab3d740..6d8228b138 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc4b67edab2db15f32adbeac8b0e8ffb6b565c824fdee65921afce9689fde634 -size 718796 +oid sha256:d9933102605aa2edcf7f781fdfd16282a70b5669eb08b3504f5ef37f474301ee +size 708388 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 726284d1d8..d70fe57880 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:efcb6451b57c35b1eb873036c3d6d1cca53bd87bbfe11aead08cd27f0515791d -size 756084 +oid sha256:a4ce75094a7f2a493ae555de89ea967b05e9a5a698e4a89d03f10c14c036a72f +size 747748 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index f84151a2d8..b9b746848a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73970203ccd3fa53e640f71a43b129399bf246160bcb1b1dba7ea3911342a513 -size 628212 +oid sha256:e7294ac95aef7cd4486f2514189d19ba79b2bbc7f564bdf305ed30b052419172 +size 618148 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 546fd5deec..b8041fcb46 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:122bd4e0e2ac39ad0c024e27b651451eaead76d28991dc4362611dc194f6100d -size 660124 +oid sha256:5069878708c72032762dbcd34039b164e1ac5d4400071b154dcc0a323d07cf4e +size 652180 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 304b7f3f8c..1337622983 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0301ed1ee38cc1482c802271359d63ec656706f50bc7631b222ad4e5d68ecd8d -size 846166 +oid sha256:a80792bebb83dd88031c7282ae8a93295b1c4230995d3c300f38dcbec62acb5a +size 834276 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2b052d1be1..dd3826a567 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fbd4a477eb08fdda0d083c0a40b74269c1da0b1ae258240608e4d4ecb57b46d7 -size 752522 +oid sha256:10a2fbe43ff815675f5400e7613285565b68a252d867a8fd66a3e3c4191915d2 +size 743446 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 652fd97d3b..60013c6b02 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa6fd2ad3a50a468d7e419bc941235605e08878aab23448ac40a42fff2adefc4 -size 817156 +oid sha256:6f76f2d5afed817ad7693d5759d2dace1a9a83b52ecda077ae5a0c83a95d2271 +size 806894 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 268ec6be0f..f9c55ac3ee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05c875fb197929b7b52c83ff18d459eef21ff9a681d6b18b266cc5d902b19981 -size 719270 +oid sha256:4b01985102a48432e35ca45e1ff2d2402695f95a39dc88905e3096d2787eab3a +size 709996 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index b8f196dd57..5708b27d81 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f33e5f35f682b85a51b410fbf33174e9fa89972505173bdd1f1de5d7cb61e22b -size 714620 +oid sha256:f0eaa455530477fbb1bbbe9e514cd596c91117662b671ba6d0deaaabd71df161 +size 700116 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index d795252528..c3d908a790 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e84db8d1799f3e4c52b5e2d88d67a8910a5b91d87b7a445b5af9b4ed37dafb92 -size 617325 +oid sha256:63ff3a6f5bae069e9e75f931b10b5a7221b5d1dc4cd1572a3e3e614e0151c659 +size 606817 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp index b9d011560d..9f8e9e3087 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce1bb248d6615aef916d9e32794b28303bed9f898c416997fdd275e68e84cd3d -size 702334 +oid sha256:af7f98b05ef4b7ad2177c1c49ed8e48dcc6ee45281b98bac396d1947f7e8259d +size 687040 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp index 5797164424..8acc88eb89 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a1b5b1c8145f484759234da2000af3d309748d9d3a1687aa52593f13bbbc2fe -size 602523 +oid 
sha256:b52d1c2b5e44cdb83840e727dc4c5a9808d0be202a949007b71b39fe8f617ef2 +size 589893 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index c920745bd8..e6b95c8d41 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a1956cf265118937321d6e9cdac98ccaf315b8d013bb4f4b75eca2d8fa83bb4 -size 739048 +oid sha256:d75312c3082ee0fa4f04c58b15f054db747e0563d72422a8ee9522c7a126215d +size 728934 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 5620974e42..5e94548d57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:4e8e8ad6c2af06e21bf3ccd8d0cfd66d1b7915edb752cdb0e07347027df2166e -size 783636 +oid sha256:9a766c76980bb54a709bdc9bdca96ca87790c1d4cb91362be153fe033caa9462 +size 773326 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 8b6ca28ca7..1788a5840a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc5dfba4b181c1de9e9c854e384d4265303f44b52a4abc0f9156d6a54cca3ef4 -size 648956 +oid sha256:29f800a7f165f3f1bc417ca23750adc4c0e8ff582589c02457059a6259c1a15e +size 637560 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp index c3b3c3f907..9dc9c46b6b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 
+1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb1a4f5121f151ca4d1d8230a476dd5a54b689c192281fc1bc76eb3b5c9bf98e -size 682298 +oid sha256:6abc5324da10ad469e8b01c13e9897a8cbb73229b12d0cf5bd40d26c3e05793f +size 674208 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index dd9e8eb2d5..8afb62ab9c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:378a1b9244615d04cb210132e3d6e5f031008bf368e7ceb06db1d95818066e18 -size 680726 +oid sha256:346ded4e04ff4a02ae9730b270cff7c0c3ba025772366d274923455a07e33e8a +size 669478 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 10f0fed1d1..aee878bc27 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f1c90c38437294800dbc182405fd30156e6c7f69024ff379b2a0e2fcb01c2151 -size 585601 +oid sha256:fb18fd7c533b061999aac487f2059ca24227dfc1102e5299aed1457fc5363ca9 +size 576377 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp index ea3b184bb9..9c6bf915c1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c93ef3461d2343b0bf15ea5f4ce928ddf91cf4e31c378be42c41c523ded26bbc -size 671400 +oid sha256:15656340b7b1e90ce28dae4c9840a81f47209cebf81a5d655f7a111779105448 +size 658524 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp index 57f929dfa1..eed8386139 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa00c8bbd2677454d6971049f5bb5a3f1e49a834186c89cda58317e4833406e1 -size 573463 +oid sha256:a3a377bcb94c030ce763f12e922e6e9b07b43f58b01774bdb2b94a1616c2e42b +size 562067 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 379c135cdf..427ff6ac0c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf39a4bcef901c6717d511998de8a4a563058cd99f4b3d0900942f4f7faf150a -size 783534 +oid sha256:dd86b20f024a44f51b1df0ef71bcc8ce52e79a9d06142b7b39640bee2defe53c +size 766958 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 93dade8441..99c7a6573e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:da6f4ef5b0ddb8eb4fb873c6f80436d940dd4e1156bc223be62c627001daae47 -size 752106 +oid sha256:c5cb4136b900b8269421bad478860ef7330bd1eaa2d594a33ea55009e7dd55ba +size 735926 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 2fd105c99c..b9a5c1155f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6881c2fb810fc3ce84abd6be8f9d8e161c44b855c27543fcddb0505d87bb01e0 -size 732144 +oid sha256:3a2d6647015b3d3c586c3ca0e6308332c911656d4d88b794f0745ac1a2c8a6c2 +size 721340 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1613d38a2f..1b195c58e7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ca73fa43212184ecca2b4b20c97990237c1ac799b85b6e9b675d045501c3d713 -size 766410 +oid sha256:b5ec5b856d362012c63d6d438958192304465320566828f9892c03519e986941 +size 747712 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index fa0696cad7..5868b57814 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:af10928a4b102b659d96d582f5fa6d37c53e3b0c22694825e925312a32883e60 -size 741148 +oid sha256:3edb20ae274f70227f6f2d6201e7fc79878ee8b2ac7de503b14edcf6f28f226b 
+size 722304 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index bc53d2e2ca..ba7214fe8f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:121f37599b3facfd228e65ed65ff3f7230e6bd2b8d1074a56200db5cdb642442 -size 780456 +oid sha256:62adef352100e61d77c61be17c55d48569a30fa1f37d8ee49268e8e60db258c7 +size 770046 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3208c75954..5c84c6f4e3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a73ef836320906f50f18bed7c17d4279f92ab1fad523816ec738c52c62f9aff3 -size 687948 +oid sha256:e3aa2555fd78d8c5f0aabbed023a8a53ca90f87b2824df5acc157756190d24ce +size 673888 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index 97d22cdbe9..9fd65eafb2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:29db384ce321efa15724140a81195570c55d029a2bc074fc30cc5ab457d94491 -size 808090 +oid sha256:6c642271620fcea0a2e83cb304e0ad395a596c07d53aacaf0fda2eae5680822b +size 794572 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index bd3ea34e4d..376565919d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8d23d3ca5b4e5ab1b1dd3c5533a03499c9c144e89d7ee66eb377c2a7ff70c31b -size 690176 +oid sha256:c8c2eb6c1e4679db408df3e02535f5815400817c008444279b75fa8bf09c6b9b +size 680260 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a9cb7496ed..945f1dfee4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb9179a1e1cfae0377fb2fe7ccafa71b7e332538209aa44655786b4875bc763c -size 749916 +oid sha256:b64f0b0af6d797b34124e9de282b362fcc1e44e07cbd7c0bec0c7e9bbdb074a1 +size 742566 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b04cca5d70..17c5717275 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd2f6d287fa0e40478cdd93b5a0413670f1689d38dcba7939fde8d52c2755454 -size 660368 +oid sha256:3038e09e7fa2c94a896a9f14f037bd6de825c0e1b6e70e4a49fc53257293b46a +size 646752 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4eb69b9f83..cfdd3ff98a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4dd07c7fe11ea54118fbdd87afab3d4aabd88d7ddeb602da25765fb84156147c -size 1000134 +oid sha256:d2ef5f23b11ca1a95c4f45bab7adbe2797854d3eee54c38afe7801e0c01e4c9e +size 986222 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index be2b9434d3..3d78e8ca95 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8bc7b8d14eee312a28a69d29e659eab7ae5ab93a51ec9f71c65578b1d6dceffd -size 948284 +oid sha256:ef54a738d0d988774bc8fe6e958fdcfc86a231fc243a104d30d486ab7696a333 +size 936444 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index bc7d7ccfe2..557c888585 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:997c493e72c3dbd78234f0628e4ef4b6c9cf7e811506b9b6a3ae41a1acc3b0b2 -size 944880 +oid sha256:1b9233d42907c76984120cdafc65ab4aee2dc3eaa7ccf944892a69c190f74cd8 +size 933928 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 0d95f08c62..5168f36475 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2a2b40287fcc4bced74e8ebe7230ffe836e9099bcee1eff2235a4fceb3a8928e -size 894754 +oid sha256:27bff06abc2276486a2a7adbcfeb3e6b4edaded17c610653c12e3e7302a6a68c +size 885924 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index d4d42855a3..fa5f206cf0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:716816d2d15edb7626cfca4368373efaa2ba88ae4a990415c2485513f3364bf9 -size 779984 +oid sha256:bab3e121c492bf1bc69146b449f37ce919326e64a368cec074d5e041fe47c407 +size 769870 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp index 27f3261cf1..68542e00be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ca0b307977aedb1cc50dbb1f291708a79a77867ecef6969ae59953a4593d0029 -size 757388 +oid 
sha256:559436d6913448e9fea71688d51f07834e89f3f990999a1d8bc41407dcf3cad0 +size 744314 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 16ebcd5c68..28fc6a3056 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e960223a767c0c7c57c2429646d761171da45639c368cb1bda4ad93cfe1dcf5 -size 735878 +oid sha256:36dc8c59b3eaee70fc4cf759779f33c9143a76934125bcb3d0e3be57f250ef1c +size 725568 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp index 88076746ce..9935258df4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:66a4b02c6eb81dd6be8b4a25874d7a010f3bff13016abe49c39f859ca1666ba9 -size 715896 +oid sha256:573d2b23ab16ad279b7af4f58505b5b5de0b15dec9208e408eeb60b20ee04ae0 +size 703464 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index f9c1518cd8..b9de13863b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e28b323fa22dc1714c7ce8dca509bd85944b2c148bc4f27aac65540b92036a2d -size 790624 +oid sha256:9f2cb89d9bd8964ba49d1f729ef0ff3ceb371ba8b64813fca2f895a2fa27dcd4 +size 781546 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp index 3692e3f09b..2887010b53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:cb4c706b48e9d6c05ace138219bd083ad689d88aa956cb304f158308f7d632b2 -size 822270 +oid sha256:779f08125251982d945a2c175b0f427bcde1c5033939f9ba27057626d94d4ff9 +size 811368 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0ec564fa50..8021cdad7e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:95df446cccdbb194ba875651b975cd7c7f30e41ab46219c3d3331c1ac001fd9a -size 975314 +oid sha256:39522d2de4c85dbdde8d31d373b55f7f815a57cacfad7a4f3bbf94e243f52e82 +size 959528 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index aeb3b795d3..5bf4fbf7d6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:8724c1944903663e20df2dbd679d5dc37979e3c19efba66af69a520ddfd923c6 -size 923462 +oid sha256:baa726a569e49d9a2e82d296d19834093029d808538503f29dde1f94f67facac +size 909748 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 668e898031..85571b8882 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:49228c3301fb949fb927f030e05691e86ebf359ab59dea531c67436b7e5cce76 -size 931010 +oid sha256:a439ac92b1c9dc3ad3ad8505c46a778e77a7bd1af19c6c1f2ee863f5c7d84972 +size 916210 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 2dd4260211..d9b9efedad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fbc02c4b20e1bf52718d3771d08f38619904b013d0332b4759bc8bb938194a5a 
-size 879850 +oid sha256:bc1f4e5dc1560339df37c06d8c32bc2eb0a7636adc0becbe16268d07d6d9186d +size 868948 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index c6513257b4..3c96d220a4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a3161fa440c911bd4e738f6c1bdf57700082ba372ce851f25d46c931c6917b9a -size 754324 +oid sha256:c7977116d962899ec5b9d14cd4bdac6dbd7143e8d296bc3497f7009a3bc7a0a6 +size 741498 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp index fdb49fe491..1b60f3f860 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:cc15c78acb404a76c5774ce37aa70e22e1d6d3e74ac18678a93ac8b03f4520a3 -size 732518 +oid sha256:68d0836e1d4b6ca9fece0728ffde8483f27f030043e25fc0918476eb0be1b2b0 +size 716730 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 561d0f888f..b83c06d587 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d1004a7478f31ea67dfc6d0704fd3a934e304ef64a2e8c1bbef5059904c4a33 -size 721220 +oid sha256:588e175dec67342490e873bd044ef557dadcddab9c60ba80c7669178ebaf984d +size 707802 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp index 954213ef67..105a139100 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:afc3c51e4fa55635e2b3c1f740575730120a8b8ca9f0d308219f7d78843d7353 -size 700400 +oid sha256:20cc2288753d5dafa5391201f7f1fb22b431d7f810ada4d290bd1f3047b547c7 +size 685648 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 7350854e29..a1ac9c77ae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69f2cac73b258fc2e316594451f1092609bdc887203db6cc5eff0c5b9893fe82 -size 990544 +oid sha256:4eb8e682dca5a00e4ad7831b0de5fa291bc6e61913412709daba5c83b02375a1 +size 977174 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d23ecc59be..b6cc098b6d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:afee26974866d89bf225a0557d85a26a28ae2b5ab31bac9fc4783095d10b5157 -size 894040 +oid 
sha256:e328398461bd1b84fa47dc0efde774dd02c9e6042ada101852186e4108becd5f +size 881608 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index fb48857b1a..31b5c0074a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2974f6b074aa83c8e3ccb75351fbb8491f2e2a55f9a590d3d182d9464165a6be -size 942344 +oid sha256:06cf1347f68c7eb0e621cd2ee496b9ae883af540e17c562e80909f5efe102fa2 +size 929122 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index d5b8fdfc9d..c8bf6657cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4c8c5799a4b23de476424bf855067d725e0a2d11d0a1faf5578ca5fef86cc17d -size 841004 +oid sha256:cbddb01eca6dc4023254338e3ae78d6d30461d7389fcbb36b8dda217e6c95205 +size 830004 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 315d5af8f3..a7ac4ac5f6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b49f2813422f73dc811735f836b58811fec370b064e30b012b86426f6d885967 -size 847280 +oid sha256:8bf1510a8a80d0159d151df025111c3abcc7d0806d9ecef5165f98b20ddda9c6 +size 836476 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index e6c1ef3caa..2975609623 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b7aefdd66502b7dddb41827619624d4ac1200fdfcea96552ff42575905f51e6 -size 756646 +oid sha256:660554ac619996091844b55295917b9d116a75464583f427debf72a623e00f05 +size 748852 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 4b03bd59a5..4e5acb95ff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e6c17addaa79c3a6ea1d42fb9db686bffb43b3ba6f630abc592016715443afe -size 935092 +oid sha256:ca653a4f0f424a5fb240f609decff164b94a3e1ccf40678785bc7433561ecace +size 923152 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9bd7555b26..b47efe99a6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:65cb049837324e9427391ae28871ddb0313a8c10777af06f0b0c43627cf73a73 -size 845988 +oid sha256:4b34480e3d45f0f549356ad52f77c6e46d320ecde61c1998ffa84e98a65e4c9a +size 836910 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 97e6307213..94a956931c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5dd0d0433f90cc25c27f2762de7df57cafd0312411daecd3d50db5c79d47e719 -size 890196 +oid sha256:c703cddbeba1867a371d3075cf60ab4ed44156cbfb38a79215cef0b39e462280 +size 878258 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index e99c91f590..da5a2ba121 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48ddfaf473c16da15cfd92c2755cc12db507e9233fbd7e272a6139ae6def442e -size 795518 +oid sha256:3c2c3237d8251d3832c78ffa87822953075418154431287be816f6b7479f92c9 +size 786194 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 53bab3faae..cc41d35aa2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b395eb57ee389ebd8f570f4a53230d5c208bd886a1171747b679b43d69f23c31 -size 775968 +oid sha256:4fd2efcc8cfd3c2b673ba687e4fc57100d12bb76885ba85d7ff04441b992585b +size 757024 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index cc3e01d337..862b2c7736 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57e786aeee4d381d5869806f4b93165c8166512fb5c6adb9b0c0d68e817ff28f -size 673298 +oid sha256:523f5e0da3e93f9c373dec64ee2bc9c8a1165910f492057eaf8dcae2f5a4ef08 +size 662790 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 4c485c4c0b..355946f297 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f62b9cd9e7dedc92add2799ebfc8a5e79d3f1681105191cd49dbb0855c56daf -size 755246 +oid sha256:2f1a99c7ce422a4c0ca00fafb1ecfe97c021844b7154b434d203c875d604f674 +size 736006 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp index 7fa891f5da..5242bb9624 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3bec438c01ec97e9738870bd4b2a68d32294933940b2e677bd92dd70061788cb -size 649912 +oid sha256:c8130e67ee473fd536775efd21e3828e2b71fbbb77712f631fe174e56e046ddd +size 638712 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp index bd66ad2d9b..71a49d2510 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c15323b034a255abe5efa5cd23c6e3d12fef0ae1fb2020d9b8bbf7d8598b7327 -size 878926 +oid sha256:b0fb881f9a73b1981a8ff443f754237b11e46641b1cd48777cfa51f4a59ee237 +size 866742 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp index 98e78d6ee8..d1e529c02c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94a0b5cf8b7394f49e0750f70f102107dc73cbe48f8b9870051d1db0fd5234c8 -size 787306 +oid sha256:f6f0c0a27cbfb0aa4a8524bf817a18092921ced8cbcbd5509538f9558155bf4b +size 778574 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index c0210a08cd..c479c8396a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1cebd791f0d263ce69c3dc4586744fa961558844b1ae8fcc111153484c87892f -size 732256 +oid sha256:a4faa93d35d34ee14f5208912c37b1ce7cc9196219c66fc944790edeb9b186d5 +size 715632 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0acdcc2a71..7b5c4d3d49 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f22f5dd1b3d223298a26b25c79d81685b912a6124cf929514f75cd28758a0481 -size 637232 +oid sha256:0ab4793874f583054b5bd7bdf1f547f52d2be09aa6236b9ee71bf561fffd059d +size 628008 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp index fc9f4a0b34..e019ed3cd1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c8883d192375444a84f11b1857da61608e3758099095466ec81771b04185b34 -size 713410 +oid sha256:56240b739e5bc66737510e52e28539236009623838713f7ec6b50c67e6ddcaae +size 696340 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp index 39eabc5d25..38af5aac29 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf07b501c98498b9451829c42ff23a29d4ff306f34e75ff4b089741c09b1a32a -size 617249 +oid sha256:f8131a4ac87cd98964b61c8c57580f5052e4a9bbaa3a9b5dbfd30f803d86e702 +size 605805 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2796b3cf7c..95fe374506 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6bb61dd317ee7bdd7f22ec244d906ee1c1ebd06fe34e408ea9fe992fd349e81b -size 862298 +oid sha256:e21dbd17adee71b8586be5ee2dadaecac9ac14e5c264728ea86bd3a7c90246a0 +size 844192 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 56ea0021ac..5def3f7136 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c285d188b80c10214a89a26c10211f7548e34b50bc8bea2fd053261455033df8 -size 821398 +oid sha256:ca9677801ff21776cc27bc96aaa74903b427743cfa9f373470f8869f3374e77e +size 804428 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 74402ebf14..0dc1f35a26 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:91ba13aa1ed0ba5f004e9edad77060219403bbf9476e009c0347065923df9cc1 -size 837576 +oid sha256:e1d5b31063b34d08eaa7c7639763ab4fb1b00e8b2d2a7a81dda61eeb0b1f8a10 +size 817348 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index bc54618763..19dec3c913 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26c71b0ef0729b6e5a1511799cc7319cda3f1f6d7950f46d4beb25f3630d4fdb -size 807628 +oid sha256:7a8abb188145c3c2437259d9ca82dcd0e439bf29d1689747964f15737bb7e3d7 +size 786858 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index e2670c9736..6e73672ed5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:15c461449b97ab6ee4bc90aa325f44a09c7a57e62b630719469e4cfbe4fcf994 -size 854088 +oid sha256:6cc327a6682d6d9ac1809028af721b3e863542701f653f99669e3a40c22be1fa +size 843926 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index be884f9449..49f2e481a9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:75580c768e77fe8146751d5043113b2c0fd054d6b17b911c591069b6280c9243 -size 759360 +oid sha256:ab51186516fc955b8a1c8d68ed28dbfe189d7ad2a659b462106c4b7e7e7bb9ac +size 742982 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 83890adeb0..096b223717 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:506ce0185d96235b6a9908a219b12869982a7cc0600dfba07d700041d4e3dd9a -size 819108 +oid sha256:33753971b6e0804028bf4cd37597e9e067495002c903da0b8b7981db87b5441d +size 811610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 98e777b4d2..b82ea3957d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:58caefa5dda0cebcbef779e44f3c630bbaa52ad6a6cafefe7d95f85cfd54a246 -size 727292 +oid sha256:2796f90c4802046728aa7c43145e5dac3abf38b12d38b2e69efac3fa7e4b06ed +size 712096 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 40a52e9c88..85afb389de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a047a6284d9d4c36996003d05bf1b7df142a48742b9bc780e0d2481085fb264 -size 709460 +oid sha256:ff102cd519c5e0cc218607a9c00aac875243b11305ed286b7d98ab2817976557 +size 681784 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 027d1c60c9..ea5be20c07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:66f2a7b52f0daf052accaf0551f02c4fedfa214940026203d09346e02e93d475 -size 611277 +oid sha256:70d4fc238b60285d144aa856b8a795e03936f3910321ba971212817cb6d2cd19 +size 593617 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index e88321b4cc..db3aafedd6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:49626291cf42222be79b7cceeeea882f2c6cefb8a55fd388d8ba5fdf011f59ba -size 733608 +oid sha256:3257c46f014f257da41c41bbc3dc98b9b5ab815dc3c2b61bd44bae4d4ff22948 +size 704402 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index ed32f4b127..102108c2c2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:08f717444834299cadd8522752ec4a32793a0203d261bc232736d4cb9cf288d5 -size 633946 +oid sha256:660e2b6feab40b238ad13752c273d0e202ef7adea29eb288c73dfd6a44f89177 +size 613965 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 40d134ccf8..9201951b51 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1583a379b1e84abb2a974fef3a5299085927b5d4dbd2aa5b43eae79aaef9c6ed -size 711432 +oid sha256:61703155238bf944e54415063011361a723088735b1ccdea303b8b0205ba8b1d +size 682818 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 611e6222aa..81b4c80b59 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d1017c9eb763073ce4871289a514797a3afb3441e28de199a4848d25dd91888f -size 611967 +oid sha256:ae1336680409bea2318c00c69e407d14e65caf9f4319ebcbdad7300c27c30699 +size 593515 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index b90fb8c1bf..7c35e0b216 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e47bd3d2263569bd122e4bfb87b1bbf0fa57ff00d5f6ee9ad7f5192329f20f1 -size 730152 +oid sha256:28a7b14e7d15e116dea12ab99dd6c6a85c753b164304dd98f97febdde50a7936 +size 702920 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 3cfd2ce736..0189cf7d69 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:31095b0a7c87cf299225f9a5759f2e6ca2e87528362dcaac8db27447ee660fed -size 634192 +oid sha256:a5ecfa802c98427d6fcc850d74f0635624f95974ddec9838012325e9fafa1eee +size 615247 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index fcda5e778d..a03fdf52ea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:639a29d1d147196ea26f061a67e208becbc7e1f9037c96f1367de4560bf358b6 -size 777868 +oid sha256:bd52bb1f48921fa69c8718843288606e8e7ce21fed956e4af9cd58e9d9d337f3 +size 750390 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index aa3c200aea..9f54208299 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:29140d83b84693505416ef69a3aff43e1dd91f777325168d9d7e437cf8fac736 -size 681266 +oid sha256:8a1ff9b0139c44e53380c211965b5e816f1086d332a7e4b6804d36443f126c33 +size 662568 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index f962f936e6..d5414a88f2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d2ddc8597fd0273070e17aec1b49285d46aaa9dba1aa0ce969b2d097862bd56a -size 801818 +oid sha256:765bc4d11ce6326ab5158126c8fbc8fd643c973db41d552982dff1285ea7d7fe +size 773402 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index fe4b8291ad..33ef35375c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53e3ff05b40dbb064d1fe281d5041e528ad3f9808c26afd715973b8cb2f32fa7 -size 705364 +oid sha256:218d816e3bd87b88acb989ea6130dba03c06e73daccaad7b3b52355d33afde80 +size 686370 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 26a0f721cd..2ec7d62e31 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e0684539199be9000b25f3f9d5f6700abe267c63bdd70f58846e27de8f26bfbb -size 792392 +oid sha256:554102caa5e95e5d7a4ec671f6f19bd87bd4f02dddf8469a5a6ad99be5cee93c +size 763334 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index a354a2f74d..89e2239085 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ae0353c9a3d64d8883ebecc5ca86eae99286b48be66a854bdc097b10e056f4e -size 690954 +oid sha256:9c9a2d2d1c665f18dd71d12cd004495b261bb37d853b7da20a23f3332f4c1b82 +size 671616 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 5286df9278..ab4c6a87ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:b94a44337559e881d532e639619301faa7de915d8139f91c36610c5502d44878 -size 817180 +oid sha256:103e7210f96e14f0f2b5f99b16970b4ae53bbb026becf66eb98eddb7499f4a2f +size 785410 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 786edc7edf..6a338357f9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:243b0f4ec67cedddc4ac8e123ea43ad30cafdc88a7f201874ee83616227146a1 -size 713622 +oid sha256:66e41125a6ee3b3ddcedece92887de5737248ec08e5ce5437de3c6c79f4d4f70 +size 693544 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 653a09b708..acd2ce6b04 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:35366c7c9c28cfd60fe0896608fdc068983fc6686866f97cc6d4c2b4f9ee9133 -size 665780 +oid 
sha256:e9a152ec6fae69b3ced532b273c25f7ae2416c8746e12ed0e7febb84077de1bb +size 658184 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6f1fa5f925..f43185df95 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:39c312948f313f89d10a510c03efc443f2771ce81b641328309d9eb05d062eb0 -size 623402 +oid sha256:ad8a9460bb2a2de01151828a466c8c8dab9935cfbde47ce2b42f33015b5bc231 +size 611807 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9197e3bd83..ba6351ddef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7eee52b72ed3757849f2064fc8fab93e162f4cef7f7e4744a5f3247b293333e8 -size 588991 +oid sha256:c33bc3c1d8e32db8ffff87dec11b86b5a209ac2513d893dfc312440653d1fb1c +size 
579963 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index c14f1834c6..9b478c8c5e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb674265dda3c180adf19f20c1e2a2190f683e6b6bedefffebd89de5f4575112 -size 549967 +oid sha256:242a89cb4bd8d2ef461154cd472f1bce15139de09a0a3772fa96a6583d2f320c +size 540149 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9c43847a4a..8ba2d3ecad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fed50c5a4a561fcd9c5b06473b8cc36a165ffadfba83f96b24b8f3d085816e71 -size 655858 +oid sha256:306862e6fde4e2aa1ecd213f43991d1d74b17df617fa177b00324e14d719db80 +size 646288 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e60ad8c294..f50a2c0ba3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8cb54afe8cad58e069df9e1139e80aacd8cbf150b71325ca48fea2586f9fc15b -size 617375 +oid sha256:b858c238d1e1234646284b489b481f2540e093a556aa85f2857dc3e41ccbf688 +size 605141 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index ca2d58547d..7220d54a9d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c191bad88a1467798e7d65f7c41e55f6a93b92fee462be0cbeee85a159777360 -size 576307 +oid sha256:2d532e54c7a3ae2ac285d5a27f069d60a97757375072e053d3a01513c90a7afc +size 565701 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 63831615c2..e203ad7c97 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00cfbd957b69a5347972c07c932fb57bf4efb8b8fe7114cef34390272bc6fadf -size 542067 +oid sha256:03c7c75e8df431ebc236842482605bffb5e4221c07f816e08ec834f9f2f65533 +size 532299 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0f2e0c42e5..6660d2043c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4fa5a4491a7e97215f855d8f96c74d9ad822afd2f8055b6be7f0835a48991270 -size 790268 +oid sha256:4f499653ed7ce90b675e55b9ea5e54c4355b43c970fda874a450e1f21a3fbc06 +size 761408 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index d5390c4934..7aca5b0d63 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c0493e07b944dad966380af709e5528f84f54d86c37ecf56ab570ac232137be6 -size 692974 +oid sha256:8ed66e1978c3570338a1806acddf638eea66e55972edc3448bcc0706c459bc1b +size 673094 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 17482764e8..e21412e3b2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:028a6f6062884c8358d0e4fb97d0a6096621de501af0439a415387de15ae1c5c -size 673112 +oid sha256:99f20fb32a32b10e64254bb6242fba3058cb2012e5173f52af01eb9bdecd3ebe +size 658904 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 51ccc55dd5..f492220ac1 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8824b7c2ddbc930a0451a69a4dc85b9d3ca97a1f3a8a4063d88e9a19abb121a8 -size 574485 +oid sha256:8a3ea0c153e35bf4681dc78851dffd2baa32e930f5842098b6f99abbaaeeeebe +size 565359 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index acb86cc039..3ecb0eb7bd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4cd9bc78ed80b20ee342929e44019560a36d145d497c652ee87ea71c925248ad -size 617707 +oid sha256:496e48a59bb8e680d7bde0405e7f969370c7337279b81bb4a3810a896851d05e +size 614599 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 63b6b68a82..50314e5e17 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48657f1bbc47c7503de553e7e172382c0e206fef95dedd47b85440bc6d9af634 -size 533833 +oid sha256:b070877c56ad8265e34880c16b6032af084eed631d8847f229cc5044fa7978b3 +size 525001 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 8d0b16b30a..1029a6de1c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a9f7c3b64601873adae87dd3b37db7bfb57ebec7e1c4ef20cd9fb0b197be1a73 -size 812294 +oid sha256:5091e085b840932396b9cd767f727c8121a08cdb9c48178cdd92a8e88127b2fc +size 783928 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 9998282335..a5e08ff1ae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:8816e2a61381ca599e1e424c3420c2e5ae14242e4c91262a56fdf7e7c63b70f2 -size 715198 +oid sha256:5521a1d2800a25322f9ca9de6cee1354d939afa27839510f038bcf669bb9181f +size 694084 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f3d700a11d..c5b44007f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e6eef6792527ac1021d8559f5c890d98a3185bcc99a46be42c3b15ff4ab6b8d -size 607915 +oid sha256:866d32dbae3e9ce8a1354ecdef1cb90c61e7a6f75e12c1c0b393c744a23fb343 +size 595089 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 28370bf61c..2eb5565540 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6e508b3042884fd1e4a3eec1fc549fd6cae2f71ce9f65b458f46fe9b188072d 
-size 501495 +oid sha256:21b10ef5c34887d5fd59ece112369ee13b09ed5756e4b4fa0e01babd8b727f34 +size 490939 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 47ecd0faea..142493eb87 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e10cbb7e390e3ea552e3d955f190bfe0cba8069a54a27607cd7423bf49ba2b75 -size 568595 +oid sha256:0c8421c75b14e09fcfffdf721377fbca70b822dff6c9dc0a884c2ecfbfcc26b6 +size 552217 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 12aaa3ac92..eae3a987d6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1337057dc8702a9c1ca0bbf05d78965c07ed4031ad956824059af1c152e1dea5 -size 466467 +oid sha256:246b5ffa0d19b4e65f241db911ad33d51b051d125ccf9bf3da06e82571bd4fea +size 457785 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8ef4d94555..eed5a0dd44 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:014c5f68d4028a3c3c071b78b774034091c627d482d70747f7c402dc11d09cdc -size 689446 +oid sha256:710a7614292bcab3979e16da0c4aa59794231a6e2f68f455b848d89a4c68d556 +size 681602 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4309a67dc4..d66fde0d31 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33c9f8354f349f67671830ee1c2dbdd3439ce8c050ef54a6afffbb1803778228 -size 646326 +oid sha256:72a1957695a645ccf4562bd402d67816e9ddb59fff2bfef9993417891383d82a +size 634684 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 652bcb54e0..2ec1e8472f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40285e4b91bf5e5422080e423f38457a1e4b70a6ef4d82e1293d8a104484dbf9 -size 614433 +oid sha256:2d83313a0b28e34509c393204dacef9a8b40c9564eb9d6fc5047c5f5e8276f6e +size 605405 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4c531dbe86..a43d643503 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:ad5c5f422a30f271c20b96753a1d248e4bdc96d96abafeae75cc3ef3ba668c45 -size 575457 +oid sha256:1ef3861f5c16d829d7a4c703b95782d17ab6ef99c03d16e9744617e7212bf03a +size 565739 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c273fc75d6..7f763a7d6e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b75a93ad032dfab752a703670ff4b00f1fc32dfdf25e330141137e7d2105025 -size 678784 +oid sha256:58316766679a1800e38bda75f573b651a608a6074d1a668764348207e5cce503 +size 669706 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index dfe2fd9726..c42f68e1d1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:4877e1fe21c5fbe1c9a1455e8f211c7779b69eeda0d486ec43dcada27dff76b6 -size 639610 +oid sha256:59855571db1fbc5e1ee0d0145c290b0b2b91ccaeb3ed767fe388a7e36ed8af23 +size 628018 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index b70d2f8bcb..b6da42112e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:63929617775b31291f638ac86b3495ade512c12b9aafe78d11b7f4bb64629a0c -size 601747 +oid sha256:dc7fb1e4eba00deb7aa53e827aeef51eb4246b65bcd7305a36cbea7b79e2877e +size 591191 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6e42cc3757..e9c269aba6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd4c79466a652b56146a6db712aaa4a293c646225afec518207c718a37213545 -size 565979 +oid sha256:12d5c357b7b7650221dc74846344c8bc29e1b6dce9bd3306b40dfacdc7a82c9e +size 556951 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ded5842a3b..f14fcded52 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c7f4b3a824bbef24a14da52a3608d1b49708e81f5c25f868d6e2433b6a267f9 -size 863512 +oid sha256:9ea5064fc1100f89832b09c94db349c07aa31b3c059f18bef4f0b27e0adb9a9b +size 833814 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 9293088035..61c315b0f4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b9d75b3b65dd6b6ed6a1eba7e59068ec7c857c5ba3d073c3b16367651d8c8c58 -size 760496 +oid sha256:076730996f4b14dd110a6879406662e895ce793e85b4ff543f503957d7373340 +size 744364 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index b605b8e8de..e658723152 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ace7e82dba210c7763f1e4f76c87dce20ca24fd4d86b288604f35284388ebe3d -size 694062 +oid sha256:d8ac0abfd3a1c418654a3aa03f5d495d146419b6c290a90564496d05169598d8 +size 679510 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c1dfa7700e..e527bf2265 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e179b77028d21b39ae28f6ab1a19576e031db39dcfd63aa11f2eb5403a518522 -size 590207 +oid sha256:9f33b6c75769bf202470631a1153c1e2ec2379918bf44093c0ab9b44d85cb509 +size 581673 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 033aa40056..602cb14c7c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e174cad60bf3109d8983af4afec16287f9839e12770a5ecc8b6632fcfe2af88f -size 647144 +oid sha256:4cf6d5affe091b84a9f48f9abdeb2a161f9fe2e8b7662dc940f7984aef3b7ea2 +size 634318 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ab2c9af024..c597c9ebac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:945d8dbc3bfa49159ab49ed9c48f8eda2f848536ecebe3edee152a1c5b046bba -size 549603 +oid sha256:b7a8126f342d4860f130d4c9d8100e16039c8d95ecf9aea79bfb4c125c3c0549 +size 542155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index a46a7300e9..78fe1db3a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a6d4e43c640064fec17e359cbfbbb73e9b2cce2dc364b3a8aae547f0f91768bc -size 889388 +oid sha256:0021ee31c2c15509e8dc065153c522da4b3f681603e0570ef059dd73bf80cf13 +size 855496 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 30e7bd59f1..290b3f538d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b059c89b3b1942c7b7b55050a3a4b44be887b57e8a1eec44dc8028f45cca8dae -size 784102 +oid sha256:6b10281756fcfa9605abe71dd485fc6ba0f753348f2a3f46d78950eea5726289 +size 766440 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 979aad2bde..58d423c676 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:486b9108007433c91fe61223898d3d37bd88f2b3728ff89d9fbfe5128ebd555e -size 631778 +oid sha256:4b6071baf61d5049b6821bdeb8569851281e8ea614c2b8fc73f0120ac7470fe1 +size 618162 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index cf08d57452..d39a3c201d 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a0ed4df954dc66547ef74eeafbe2fe579c8a341e373be8d0561746550b77fded -size 525209 +oid sha256:c1e6a5a680989d3c6512b4b38152e6dae412d07917d88b233a01f7f327146ebd +size 514653 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 24efed96d8..e6753c758e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6bc2019b42c308c22d0bc16fa24c14a5083dce21925ebd052110533f60c89a27 -size 591273 +oid sha256:f1d782aba774e428b638f6c89f8fb05fcd7eed0730640c98c8c687c33c00b956 +size 579383 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6fbeb27f67..689dca92e5 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e13ade8bf9d5d5dd820f6216593d271b459c1743791f6065bfcf560e12f403ba -size 489391 +oid sha256:3faff687d36b4c177bc57321a320303cbde181d21b196dda56276b1df53fa419 +size 479919 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 0b21524f1a..a89594dbe3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a90e14c3f899306e2bcada888793a635b1376799e9010df6910e072f925deb99 -size 673990 +oid sha256:c222ce7e6b192fe36db1fb11ebd705f114b0a5da03e7d8c53a7b6f0ba06f5410 +size 657610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 158f553e91..f9967c77f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e5144748d9d55af3d64a309f8d96b040ca6dbba37cfcede50a9f35f0531c428 -size 582221 +oid sha256:c9c1adadae9f43f251386184a18fb24c4148b39672b9c8d9987e097f02fe5cf6 +size 573489 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 32dff82c69..0cddb7aa0d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b1d97beed679b197c3d2b4df7522c434e45e3926b86473d3f50e9b07e6ab065d -size 689406 +oid sha256:7dacfc8a123e1ae1e3008ee34c5474c8c9783ae03274767d9efe5db1a29b81a2 +size 677220 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 3475e935a7..8404294932 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:460b0f408b3a225385e1faa620806ee16df927a9069fd1f26d2f934f4c2b2476 -size 600153 +oid sha256:c275e1f43ccfe5dc0251a133447841c96a4443e2e9915a46b6e78582b882e949 +size 589989 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 67e1518836..4d645a251a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80e4306ed169e147bbefbf43947c86b9daa1fc3b8ebb26654ccc9347ac99575c -size 671224 +oid sha256:b9f17adf5295a9f709029771f91bc5599a08fa20f38621b1a8ae0e21fd9872f7 +size 659138 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index d8cbdd52b8..b1c771987e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2eaa2b148b881765f950b0e40c17a2b3ffcdd154de6253ebc74b5263ae1b92e -size 585771 +oid sha256:677ed768db2c16c3dc8d279d8c362cba49b3509d1712f9423c5504f3fbb58f4e +size 573931 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 405a616a7b..05ddba0d3e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7538c107f54f13ea4bc019da868c876d4f38f1b04bebd09c9ae765094d940abb -size 688564 +oid sha256:5fe73d00556675c401e108e6e02482acf91b1148f05bc682c7e44a4660c00305 +size 676478 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 5d9a4c4748..bc47087e68 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d1d7e27da7a20a569db88f7a687b1bd0bf94f477259a048b0bcc180b1b03f13 -size 602913 +oid sha256:8244cf4055ccc876e3e0b6606c1e3f95c153b162caf917740dcb74dfb13db769 +size 591171 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 5a5f69dd6b..33757a303a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a510096975e01f83bc270394fc8b6d7c9c2843b0abcdda52e074a6de61d1001 -size 739486 +oid sha256:819c0f3d375fd6d49ad26caa74c870959ba24f8c319db764b1b0ea5f375abe50 +size 727400 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 37883540b2..04b230039e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7b13e171304b9cec4deb18a217891a233f5c26d907b76d6151086e176773794 -size 652110 +oid sha256:e05d0209aaa4e3c1674c48e85b27b4bc71c998155bf0421d18faeed77caa4d8c +size 642638 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 597e8b3f50..b48d576a15 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:629bbe03f52feb11d2d23134cc6282dc4ae1e271a4f6359ee829d05439d78cac -size 757666 +oid sha256:943998528165e348669c9030d9ad5ad4647c33e094940862f1d795d4967ecbe5 +size 745530 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 071b03031f..6e88cae571 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f82e0b30a68049793e3db61bc28b54259e06699185f2b3f0d6d046eb24f9bf2c -size 670288 +oid sha256:a686c0af3c1180b22c28d3fbbb74aa4933ba12516e2f1f81d6337c66d9e5032b +size 660076 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 1e21c72909..796b0e4dc5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be55e13d6498a4ff3261ca978269ee9477ae0966dba6807289b80d03fd3107ff -size 761508 +oid sha256:d7383b2d4db0156192bfc6ef1e48f4032ca7cd0730c69e4c1ee2453e2c5ef423 +size 746116 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index c5ec9ae1c7..fccee0481f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:49a2c3068876ddbdb4dd90cc7fce6b80be8001ced0a373ed0ac1ddb946b2f649 -size 671764 +oid sha256:4f82e67c14057f37a946c5447f782cb7d35c66739b89f21acf07f56eadbe34e4 +size 660960 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index defbccbbf3..dfe1150aa7 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f91d37f8eef4577aa44fff22c317741f013cbf2b9b63c8e3ce1305ef0236e73 -size 778060 +oid sha256:1042a105b107799aa28e9c086367ee007eb61f0fcb0f92fad133d4e4449f81f3 +size 763160 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index d1cbe92f3a..7cc6c73f64 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ef4986ad977ebe33eddcccccaa1e7a9e0d8566315703ab0d4be2ed51ecf2aac -size 688856 +oid sha256:b0104a640fcada77aa9a12b0fe4ee49cddade6961a1855d27b0ce5bcf9195f19 +size 678052 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 002648c03b..38ca23428b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6200d1c28b1b6a823f43ce8adf0d43d36ee02cdafe2c54ad1a741a7cdadd4bde -size 766668 +oid sha256:ba2f22283d9c3dadb728b5f7e731ac2c6fe63b39882fc03d24fedc6129f62bcf +size 758182 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4668751140..78af17bee4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:650ff9b518b898ed694ac34910870bc0b03fce0fbf1451748b780903897be282 -size 726360 +oid sha256:5c9089b9d57b8f9838e5709c0aabc88c0c4afc4b1ba9642c34c42b8166b4e294 +size 716690 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4b667b2afe..dc7e7bf7ef 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c26375906206cb17ef6fd1c5dc3ac23b8ab69fdbe48fff2918e6474b73f49ada -size 647206 +oid sha256:8bba112006d0c5e56bd08a9163c9919dad6634e4a296b0dd64583a9ba95fe14d +size 636896 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index cea4482c5c..068002a745 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f218d2a4b18d6ef8f6f161f43a65bcd9ade689374dbc5e39c8f4609a66ee74c -size 611683 +oid sha256:cf8ca8f084e46fdc64859e7e3347933e83123e36b40b7d9d1f1ef13ecc8fd0da +size 603149 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b9b3488a1a..0430a14cff 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8125f7db1852cdcfc350b5fc24b0767b298d2fbbcb870ddaedf31f8386a7a6de -size 750282 +oid sha256:a95d7a4060e13c9430c2fdaf85942dfcfe43a5bef8bb70517c1daded5aa65d12 +size 740662 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2383dc41cc..3f5cabb644 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c482c87edf9a5e9abe29b33c908792625f2ddb4d63bd37250e03c0a66896b682 -size 716488 +oid sha256:a64c77bfeef947178fe5de884e34d067a6edd4544089eafbe3f21f087ef28e29 +size 706176 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index d502065819..904b915336 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ddeabe7c618928238ffe99de34c82b7c3ed801de34b2feb586ca75ebfa55a1f8 -size 630278 +oid sha256:456fc55cfb80c910e85b973d44e6c7f7e8b3d28f3a2f67095e0b85660f4d050d +size 619524 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 432e3f8b49..bb8b572638 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f41917ad263315e35d74fb830551cd943bd11c8688523ced379a72d87de7e377 -size 600379 +oid sha256:130eb6210eaa24b7cf20219a1b503c541fcce6734c35389fe8604a51707c864b +size 590661 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index db5c5e8f64..da945cca8c 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f0d14460c7ed790b4904662ce43cd7533c63b8647897f9432fc9380d79982588 -size 760816 +oid sha256:9126a45522d4599de1fcbe9be7373f5b2717cefbca719e50d45cb0da7f7bcac8 +size 745276 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index cdbc9e9b4c..84c5e6a573 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9db4d74781206061adc00fde3e389457dc66526d467946b3e914253344b521ed -size 675954 +oid sha256:7cb5184de7e0fc131180b3a02e61e3014e6511b0795ed65b9d9a049694e6beab +size 661006 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f835ca34e2..788c66a220 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ccdef537ccd14075bc3d1a54b532ac227d727d1de451de61eb96bb3a4a4349cf -size 784506 +oid sha256:b6091d2ec99bfa386e1667de3210c3bff5fa0b5ade76b8101dee61f963679439 +size 771384 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index de334a0263..a88cd18979 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b91a8f07f81a41c06b8bd674bfc8be632bfbf833d773f281855b80a105858b29 -size 686028 +oid sha256:6a1fd440f7a9ec0b192d616b16b9dfd5c49dc2a639a899b7aa4bae6280effd31 +size 677396 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a6af7a3ffa..f49ce7b83e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:978342e6a7f6eeec71a1227f0cd4af8ea525a87348c9965809bc0690316b000d -size 719138 +oid sha256:82336f565db9d1a964071bec39b4de18e3989fcdd531d69e3cf5205d40b158b2 +size 717608 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index eaf9874763..65d93963a6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d7618ed8ce8c97d14f71f701b9c2a028d33f6671e1c43bfb154541f0bbd89a4 -size 636002 +oid sha256:203b3fcca16001e033b1de8135fa1dccc32516e6975091bf07e3b1fbd3891c35 +size 627666 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index a4b6c3f031..832eec3805 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d6ae203638b36c0e61fe23f129ac494246d4cc3516194321086baabba6a19ae1 -size 782004 +oid sha256:8eb7c447c27342e08ff008200d2489ec4c4a2f6a9b4d40258f94966e824d5633 +size 763306 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index e9daa39eca..24ca84abb8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4631ebbee9118d496b9e21040f7994b067f18fa1cf80e5c803808954769d3483 -size 693886 +oid sha256:378183f755cd05236c5a432e523a57a7a22884f8365fa2790bd813ac6bed3447 +size 679086 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 94d436928b..08adc9966c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ebddf158eff7258bcea3e3b9b2f6db3bd262f467dc0a2212bff59f443489018 -size 674714 +oid sha256:fe1baca92d19b2293323b55378cc9ae35f20aa624e50da31a5dd2085d2a26b31 +size 661098 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index a917927093..5b10b51a50 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9eb57068ce4e271a411eb406475cede0c76158db33f554fe62c351d2a086fc39 -size 568885 +oid sha256:9f934c970916db2091cbea87733c058e7854ee074c0b82894bd93db35c6e9cde +size 557045 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 4913c914e6..2fdfb2b676 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bfaa39c1f1731c20c68e27b646e64b306fef882151af53970c6a85cd56ae9fc6 -size 627302 +oid sha256:1ef2dc75442ce6163eb0724e64454cbe2840cc24a254489511b7a163ec435adf +size 609837 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index e11b7a6dfe..14414d03f7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:437dfe7541d6c622e2036ae8c39235be329b1a3836c9bf061f832b434cbbb056 -size 525025 +oid sha256:1725f3d6dce3de041d6d9161551f47f1d5325ee034f2e15a264182314a25c727 +size 515109 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5b0e8786f3..f839fb8760 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2323ed83fd572f68712009e8c4165717f90e2c97f9e95e091351fd112aaa1f2 -size 790726 +oid sha256:febd8add478e62008b3053373467d3fac41ef60b4577bd644bd31b30e818813e +size 782734 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2eaff4f168..62cca20e40 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef52a629431d27ffbaa976bfc6db3bbdfdce240f6730a14e1a5c6ce645422e00 -size 750320 +oid sha256:65fcaffa4f8715efe226ad1de91b8ee177d820c3dbbad37443302595def9656d +size 740454 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1dae766af9..1efda1d8e3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7987322f77764ab070fda4f858c5e09575dbd95fe0e1654dc301f2411231e064 -size 673436 +oid 
sha256:ab011ba4e6141666b0c62d1b231e1ccd8a2531fc2af4ef1f7674539cc6382598 +size 663126 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3a0b4cfd00..d642e2124e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ca2075c8734df35651f0d5139b3671fc6863c5e1026b33e43317b7af56983a6e -size 637124 +oid sha256:3a84c5351bcc501dcc213bfce6f78480ca34fee3d4e8647f9f0907410ca63b35 +size 627850 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b31069f410..007bd358a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:b461d4ce7bb3a5c42b584ae42678365e6d3b18fc2e2609a4254c7b284bbead72 -size 774292 +oid sha256:0c9eaee4ce3373a9d419411c73029404821545bc2917d5eaa4b543e8fc124289 +size 765216 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4181660fb0..a09a6ed6b8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27d2934a4000c1cd318334ded0fe3646806523032e143a91db9c1e6d6a94e813 -size 739658 +oid sha256:cc180372e1fc42968c14d881584e869369c1925163e0f5cd056247e23c34a837 +size 729940 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2aa54e32cd..d6627d2097 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:8f0eac1f3e99dc1ccaa46eb61ef0eeb4043ed4e7f01b6a4841c1b6dc973788ec -size 655720 +oid sha256:2333e4c47942da07b5715b8cf5770c7806a349891543a75dd7bd1e63caa5847d +size 645754 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 849c6d626d..1d0a68d0d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3422967398f2b5cb3cd27b8477e78d6b32d457493730ba07702419428623bfe3 -size 626018 +oid sha256:ff556ac92a50d888c501d0b18b6e169436b3bbc78266f5c3ab5d033a3dcfbd4d +size 615313 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0573387c31..5ab1bd120e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:1ac5806463a55ba2cceb9d627d8e291dc0f95abc4c9c3e92aa09a6ce5ed5e56b -size 828782 +oid sha256:98c1c0e98144c92baa386a8aac4940480c3b3ff61389d431b1b399d73fffdaca +size 813044 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index f78d112f57..a33c82dd0b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:56d481dfb89d9f64345193e55835e377808538e9f5f28b5cd8be88a84dcaaaec -size 741750 +oid sha256:69756014a38cc658eb821a38cf83dee172190cf8671e2f14da9368c0dccd7c42 +size 731982 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index d29871c039..6480a0c9b2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:4ad38cde859162d812b7bae84f8a3e878628f4fd0c11b3cc36e16a29c6f07019 -size 806246 +oid sha256:b5735478f0b99222df7d557eef80b178f133ea9aa2022ffab953f6f03dad88ff +size 791250 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6aecc75c33..9be0b962c9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d1825d5820119ec52a428bbdb67999cd6156cea04bd75b5595614bbb80c4b6d3 -size 696522 +oid sha256:ae07aee3a58b91799dd5acb8abad96e04ce8b15849477c779810767b76fb53db +size 687986 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 67a6b2cfa6..c7aa75c66e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f0b1e7219c00be212519cc37a59ef7b729253a7a2ab63f6811cf55aa9a2a7059 -size 748574 
+oid sha256:a25b46d0092904bc99c2a53da74c463ee97e9fca81a7f762da1f20fb83dcc9bb +size 736536 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8f58a853b9..3b0b5ffd5f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73852b150b51c65b01deb8fc29798386b34c77772580d89f6f5a21e5a74767df -size 647284 +oid sha256:6a03cbd77f6c5506c8bfb38a6f3a2cf0b2042437d95b3449130fcf46c5d541a4 +size 639046 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 4a923f8dc9..56449bea9f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7fb7faffa1fc8549ce1194cad3a2cd72ce064a7dec4f41e48bfaa8ab15089fcc -size 847010 +oid 
sha256:f1f581a1d40e0b382c09b538272a9f0368d3c00b4ce00fa88ec2c3d820326c44 +size 831076 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index b0a25df36a..eda633b140 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b3d61377fa169fffcdb0cd6bb95728ec2187db0aa52abdcb1e89f40646a6c37 -size 760816 +oid sha256:d41344fb5f8353d3dc2e55940cfca49317b7f2e10e5e63f27431c56543ab0854 +size 749272 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f1e303754c..96083ce308 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ad322c248b12b873d6e8b71ee99b9761231f784c25723c4a0657b432b285bd4 -size 699020 +oid 
sha256:5f3073c015734464d3eee0b7b470ef68c6c0584f003ba4c1bc5001fb285e4d91 +size 685404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 44db996346..cdbc68b6d2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:739fd051fb001097475694e4f75a0c7b54a4bb03527eecd3feaabc2e56b527da -size 590429 +oid sha256:a1972c68ff93bd31b9d695d08af9aa3cbaa1df348d59ec21528719e06e8c41f7 +size 579081 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 67eea7c030..3765c35a9e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b41823b2cdd104ea730d7aeedb690996deda3c341b5abd59851f6e0c5d67b0e -size 
649586 +oid sha256:087751329760a6010bba152362454a414c87c77341c2240ab62da6d9e1474a32 +size 637696 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8e57afa01e..5f67a968b6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:76dd36f7b8f6bb0feb3162f955907edb3484e370d4d32a9af7b143b39783eeb0 -size 544891 +oid sha256:ec51ee610e06202297f721bd0e1bb69924af24a5dea29698b6c6d5dfae1c25fa +size 535519 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index c4075d6257..70b6a1fbaa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:152ac2a97e33c5aa3aabeb44071f51d12bb6aa1405d94f46c97806ed6f19e31e -size 708866 +oid sha256:f10b5e2d97b568af0fcbd39b622e8607323b79653d5ae683451cee5c11ac7494 +size 687456 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 9d40004695..f9803d28cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9fed053ead1666ee9b1529e276d0156789fa6047ae1008f45ccbb534b4ac4890 -size 570723 +oid sha256:66f95b5d8661f70da0d9e630da7813fca58c27d1d6ea579506e3e215e16a0060 +size 560907 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 17a9e5a1c3..bb8e0428ea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ad80efe9a3686c53f3d367fca7325ede6d95633e46ce2dba70d74892c4b02c2a -size 735480 +oid sha256:89e741356b4276eb7e0728c4a8b5812e380cbcfb682b3cdef281332060bae9f4 +size 711752 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 7a1dd485eb..68a51152e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a887894db49ad45113c3362156640cec962fc4555f5aee7329b7a4d63d0ea89 -size 594773 +oid sha256:7cb7040868dc0fba8f81d60e8b7ac3f7262f33a1252cc89f3d80280e0087917a +size 584215 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index aba2a8e108..b6ce7e224b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cafc4674e41e1c6b80e38e848f38db3905f0e426ceebda9f37a0178dbff92af6 -size 715426 +oid sha256:9dd4455a037138cc4f81e611a55276d0869944f843c99e8a8a64e45f7d17290e +size 694262 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 092c149d61..9a15f322ea 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f7013932c372ab65b4598036809a1ab2455ab6dfe7305be2001d143ba9d258f9 -size 571313 +oid sha256:9e3c8742d433a5a08153fdfed445efc90d0e78c4ea585dcd53c66780a73e58d1 +size 561497 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index bea3fedd73..2cb471edc5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f7e42c61022dddd0936dffac8b47e60986746ef0bc769f42c56086e2c379e305 -size 741448 +oid sha256:503d18afaa9f0591645f224e4a012db5838088a0b3a1df2d382728352887f3a5 +size 717274 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 7be2d6367f..1b9cca667e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f098342d65b76006d1f73213435949de778fcbd95b757b8bbc7d689239ca07ce -size 595117 +oid sha256:5b91d3d94e5e4561807d8f269e25169969499f4b2d6f53dfe84984075a8fd850 +size 584805 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 3211d9f73e..32e991eba2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d6051f24710c444aed0f854cb13fd204bcdeab8cbd68c7b9ce0aca06910c0968 -size 778852 +oid sha256:304eb257fa36106949d3ab686857b2523085e86976b448e4815149a702732713 +size 757640 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 3b74813136..cbfb204d58 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:269d69669e5449a0058516448fbaf5e1c28e39faae20d4cdeb44e83a94e8cbd6 -size 639824 +oid sha256:4b66c5d4c3a87005f9e1eeb00f72faa5c6580bbc4d67609518b278caf32cbd92 +size 629760 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 911fa8f994..e71ba4e135 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c24b7aecb3cda09034df2f8af52b84f9f8655c8dfd3df500a18bcd214f4320f0 -size 804480 +oid sha256:fa1db66fc205e4250a9986a51b6689e9184f804e7fc150184ad5553f4b67d40b +size 781540 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 5035ea7bfc..69e35e87d8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e4ec85077fad0540016fb32ea9c00220c30769e18c342680d178399b7b675b54 -size 665994 +oid sha256:970ead27a51ed00ae18b479b97793811e7e65f39dcc165decc55a1ed2f4b26df +size 655634 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index d6d6d8a9b6..956044360d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:99b72493e9044f200a58af07f580b2cc1b93bab189aedc5b6ea4a9e97e3d40c4 -size 794018 +oid sha256:d65fe7833a41d3e62f6e2820960f3d8e1f7f6fa0270edbbfb31a7fb6f83e2956 +size 764418 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index b4f9d5f9af..52d04dabb3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:ae42e08400a304ae9b7ec53252ae589cb52aa3885c9f6489ae41899a4207cde5 -size 650498 +oid sha256:d6f797b650ef9cfe7de72a459a8dba79c7710afd1f20b58665e9695ee7e052d4 +size 639794 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 9c9da206cb..71a264bde9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a8c3b2137ce0f36b291d094d8aa6c577b7fa7f6833c1abb87e292b8b8a8770c -size 819694 +oid sha256:842698f5226ff046746b154832ca004941257038d2576f3cc4115cffefb9b4dd +size 788318 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index ac90a7977f..4f64245cea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0409f400fa6a34a66fdfa73fceb01e3b7bb849674473760248d44fffe574c7fc -size 674548 +oid sha256:04ee6c3fb5b9b25abded77a29bd98a0d482ab7d8a673ccf1fe49c7069b73e87c 
+size 663102 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1fb318a223..e9d68e7f87 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33759fd8d44f91469d8daa8dec93acb6ca4d3ec895d5bb2fa85531fb0f86c080 -size 645108 +oid sha256:ff0071ca1bf37e0ca4df0276ec82d4ff3ea9e91c5e77deb65508306f8e97ca49 +size 634354 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9de4dd3c7b..81cfd63d78 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e680afb56c165d886d9ae01f0c0d34c8a249ca2ac42d53e9599eb16e600dab4 -size 623004 +oid sha256:ea8264518189304bd727f24427c00dac58b5e7c687eff7343c5881e39b2102be +size 611657 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index a48f468bfe..ae3f0b9228 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:867ba148a1bcf1b17a3fbab4e01e30983a7229de6c1a80be397b3d7244e99790 -size 576261 +oid sha256:a8cb22058905366b0646467cebae1930a562827d56f50e843ee05041b7861dba +size 565359 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 43aa477202..73274c838f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ffb0d5af0f6ae7d429997e9b2a4e24a78c60da15c38f95965e9b2da95d245b8 -size 555589 +oid sha256:d2ddabd1b68469fc3d45735b33bb83c448fec3d3916d0875c5a73c59ec758722 +size 544933 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b8c89fef96..bcf670226a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db77e532fce476e041ba7b7d543cfb633f6dc6d835b80acf2d3fd2852d124f5b -size 637160 +oid sha256:9089c1b4593ace1e74d3c81ef4530e76a8c95d23003f9808843b1c9d5d3c301c +size 627638 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2bfce37a86..73319cdf55 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41f6a169e324449428729fa2b932ac4e37afac4bc8417ac6b48662b4563457bf -size 616387 +oid sha256:acb95e23d15518b8a49b3797f1bc427786833a58dda96378634d98414b963e90 +size 604991 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2d0d2e47dd..e621c76343 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c5c60f6d8fd9a62b04dd7c09008281c9778eb7138efa2dc258a6c2791dafff1 -size 566537 +oid sha256:043d52e31d35f3ba1df83e42288eb47e71be05045f4340302457cb34cb8ba569 +size 555881 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 75786e27e6..c4a909ccaa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aaeeed4aeb9898d07e0d5631b5f0487701f26995039a21f8f5e67dc74e93276e -size 546949 +oid sha256:fd11587cea509272c1c5b001a59eebf9ea2801951c62b650da21bbd3cfcd45ea +size 536245 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 28eb02d1fb..6692de2448 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:19eececaf4463a9396d8d8384cafb292f4297d3f7a337e8e86ee23c821fa81c0 -size 799146 +oid sha256:ced90c84d183fa0c4d5c15b28617e54cb2c6df63c08a1ddb71d615f0fdf3f550 +size 769892 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 2d5a96b21f..e7e7438afc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ade78e85d74c37727e0dff251bb6f6e554391adb281775d7d2188bd228eba0c -size 652322 +oid sha256:556b1a5bcce75f13c9e6ecb9fe4220cbbc93b0ac7d9c43ba8ffa40362f4fa83b +size 641124 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 7561db5b76..7ad49c4fe7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:024c8afaaba5a2417186925b2c18874f2ca8d0b5b70ab5ff55bbcdd817a3704c -size 652586 +oid sha256:6fefaf48e3e3cf8c332a2428f2b89f2125eec005fff360329e381ec7af4568d8 +size 639168 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7e5d0795aa..a134819702 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e0d31f350108c5f0a786c372848d7540e1901819f1d84b95a35ec3e3b4814179 -size 559585 +oid sha256:cd58787ad1481ad2cb103f63daa9f8feeb971035e828c57c5bfe17159da24036 +size 550507 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index f9784850c8..1925a2bee3 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ff47121a1d20f8a057b58640c8bd66bc4e8d4dcdc00f349b28776d1d10e72df1 -size 616669 +oid sha256:1ee2545bf006428d7bbef20a4ab3c05e0cb65d7e7e267e3f27af4fc0dad9f3a1 +size 613413 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 68003943ec..0a287257f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d620fc3d27163bd025953cac5c121e0d18978d41fa03c7ca4d60d17c51b48d1 -size 533535 +oid sha256:1b596f745d96831d1ca1cbef440ffec91deed0773efefb60e7bb7600e0f54bc3 +size 524753 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 64629c68c6..a2e36e86d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b7be4072957217723c6175345d9f5beb158a2620c4183f757d8a89923bc5f57e -size 825168 +oid sha256:890ba52d1946c117929f350908acd5255dd5fd21d8279f76886a7013c9d872ec +size 795716 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 950c511d7d..9a0b0e4009 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0c12bc254171be17b3592046998c0d81c7179ff389f1bdc0f5980859225eb863 -size 675336 +oid sha256:883ba0740651d6815f2cedd8ae87cf4a1ca0402c2737b1c8d9666784da356361 +size 664432 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 2bcd6f3f63..076ebe4e32 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ab12a0801ddbe5b1fd5acb780c1f9d04e8d47fdbb3cd59c8ff0c5d384e53a3d -size 598343 +oid sha256:0307b7ff1eddf3413d204774837994b2c6f28baf068b527a9656edcc217c5f2c +size 576093 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6b5987d67e..de7a5f6545 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:682bc49c0d01c99c619d284abf7b4deec5e93a0c04814e16871854fe39ee9c86 -size 494685 +oid sha256:b512d65d4c7acc33953a23167295feca7b1970ade9832c489e76e0ca83f1031c +size 483191 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0c0aefc5f2..620cbc562a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28fc440a05f4ce0f684bea04d8def6832c634a94006d976429074b6c3e81ba02 -size 571751 +oid sha256:7a8cd422bfd01f68d524a2943028b83967f107f12f12734baab24328a3a5f909 +size 554187 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8b9a4b0300..476bbaf0d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5862ab8ff02279d384b03bdf9aad10773c8de1351ca3ddbc700baf4995f43e9 -size 471991 +oid sha256:a2db60a1347fe347238360c2b38ab2db160748dcc1a3c106cf0ee903de191838 +size 460939 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6d1751c011..754fde2b5e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6305ecfc079a08581ac8bbef0fc7b48dcfea92866f7775d07badfd473c57e7e2 -size 668822 +oid sha256:9fd0b47e3ca1545b9f71f9015cbd8c0ac99422d4707f2c60746d30cdc46bea1f +size 657772 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b51045d96e..1009a86436 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa28f022371e23e830c833eaabc19d5024dca7bdf1cb7212fdbbe184b44cc69d -size 645140 +oid sha256:8a5fb7959829f12fe880c696b8b674ac5dc0c16fa7622252842096995000b407 +size 635076 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index c08f381af3..3a8e4cc666 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f58db6e1a226e4d2019c435844b5ec89452929a17d24ca3c06d2147aa0c0e613 -size 601751 +oid sha256:476d7c1d3bc6cbb6b023b20c2b77435496d23344b2a7357ffd55019967767db9 +size 591539 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index ba290c2604..a8874c3f7f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a8fe6d698ef5c84e8c7d87a5b08c30b148b2983d96e384a3b72e1bb1da7460b -size 580141 +oid sha256:bf6eb9edc839e3852ad19ae846e191c45b67b19972574da64b0cf01e235b61d8 +size 569683 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0b547d852d..d06f8d8c3e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:87e252406d32c2f8acf00014174f63428bdb9cefa8d61b0fa329888f2aafc8e3 -size 660874 +oid sha256:fc6ff82985781b61a5b26a4d45d4a3cd6a871ac811756734ea3299cf6b1c1d8c +size 650316 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b0cc8538bf..418461ea1c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e67d306fc4f3778cd3db79123b0708273d38afc23b20f08bd8c85fb451b510a -size 638424 +oid sha256:88cdc83f9d46a93e388a210e1f73e2c6b318b7db34a463e7d8799465c5c95fc3 +size 626832 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 17555916c9..259382a52b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8cfda197c44f0b907e99cbea1f45150bf60dcc6b8441e9f5680a5f91eb37bad -size 592027 +oid sha256:b7885f5be491ba1ed87cfbddda40d9b68cbf00e9945686efb00e71872eff8650 +size 581371 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2b5641c2b4..7820baf403 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:662a4b86ae9b50147338ed5ed363a1bfdb5ad0282bd96d300029f15eb66cdab6 -size 571601 +oid sha256:47146e36d83f5976fd2a7ff0e252f5fb80eda251c4a230261defc0b040060cdf +size 560107 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 45e303a533..93ddd41dbd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eca09e562805fce80c8c5364632793346a2776060779556ff329661ee2b945d3 -size 865928 +oid sha256:7cdcb48a7dab1200386d9fa7415ca8d8b826e0160484ed4b6da501054bb81e56 +size 840818 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 0f8f327b2e..abe34307cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3cdae4eb9401bc5b4dfa448c11a376ac6566006477b7bf7ae49e42a32f4551a7 -size 720682 +oid sha256:d04927ff3a073a7b463f692ed63b772b8f4731cd8adfa837ba432a4f87b499eb +size 711754 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 0af92b8b63..f39a94e653 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4f14fc1ee33f3749d041c2f249ab3eac86eebf0e4eff09b508a36adf9799034 -size 672748 +oid sha256:fae49efc79ad13ebdb4c209a1d3235ba02d480d2a88e34954ff2efb5e74a63aa +size 659824 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d8e1a9be93..11b79e100e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dbd05e8c60f3b2ff587184fcb5644a1b90ac4582d0172e348d71e40cf54f903a -size 575355 +oid sha256:ab4c6dda8e6978ae37111686dd36cc533ffb716b778394a37f73a3efc5213f6f +size 567611 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 05b59c709b..7c5222aea3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:490c54a839225b41d22411d61b879a4467c3fd906b57f5b02d2add0101b4c155 -size 645170 +oid sha256:f374761c6906157559ddbb90d0a6c03718d1d66f20f8ed2194952d3721f78e6c +size 633132 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f780c0c618..7dd82c2fc6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d3f4dfc30a102f239c9eea308334abad148e537c02e2fb81f517b50c7431f880 -size 548467 +oid sha256:e678e24dfd02e3c699281731d9f4fd8ff4e23dbcd475dca0d8c819f5e7aecded +size 541857 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 946454cd7c..5f36fa7381 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd55112d97de2934f804414503288db75598fc84a67935bc8f4f0a70f4ad8404 -size 891902 +oid sha256:cd4524d8eb12ed6ba49a53e5805764d3332f8e9e05133602d0105b91b80135e6 +size 864274 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index e57cfc352a..0a9fcfce30 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:372c4d79c06f117a6868c1e01b52ad13199a4a27dafd0087f180906f52c6b46b -size 746804 +oid sha256:bd522fbd6cf698d3a25ec44adf388dd73f9a5893871bd8f13b6c32e2492f52c4 +size 735556 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 19d9493bf5..eea999aec4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d5e20c1c2a7bd46983bec8859b0d14e64ec1f681395f59892fa8c978d910368 -size 622996 +oid sha256:047786a0f7af225c6713b76e706caace4d182ec81ae1e9fb7e2daee36b2590bc +size 607701 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 149dd2686f..7581a78a76 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f0f9745f271af9aa344a3d9430b5269e0d83fe1d91074ecf5e106515719a0b43 -size 518399 +oid sha256:4f7b3ef44cabc5e258b24c96b7d1497715db33d5d5c225fc938dabe870c77aa5 +size 507053 diff 
--git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 42dd782760..f074e438f4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b24d77c3af0999968d084d9a28d4df707e9ed34983f10aae3d906c70acabf6e -size 594527 +oid sha256:1516053f0b0b30e25ca7b2fbefa6f714c1216bd42accde838a1a313660c852d7 +size 581799 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6da735ff1b..c5bda2d842 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c273ef202efd0077c78d11fc63f3d825a1e36d61e0859266b5f5aed8ad99668 -size 493337 +oid sha256:63bd9170cfa4d3d761a2759e4cbe0d51b630342e1b1d4062b802540d3187b69a +size 483075 diff 
--git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 48bef03abd..967d195bdc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:682896b82ba7185b22d1a6e80fb231829c0001297e6b9a00660d20f402445ae0 -size 722286 +oid sha256:4c85c88bf47fe3b2637f20e2b831e09f81fc9d6712110c29d5dcdf25472085f1 +size 698854 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index cb16f98667..db899bb551 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b9e3198e94d76763f8c4dbbaf363729c443a3dcce4f47c8e9d699629854f054f -size 638610 +oid sha256:8f93c40434dcd0c1ff8dc08cc365ec08792218656cb0f3f63618078d7158a929 +size 616557 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index e32cbba26c..a79e36a177 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fbd84bd55165e234a659a935a73a1460989cdd692e8a623a6c4dd143cfd92ae0 -size 746434 +oid sha256:bf4e4757401e1414eb8647acede45ca6f41dab8e555f19990e4ebd59f035cafe +size 721472 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index e28f4d0000..4165b9f023 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ffbd7a612947da9afd21f6824969a29b737939d04243b4948f55ccbc78eff318 -size 660488 +oid sha256:694289f51c1ad33b61230ecf1e8780cdaf1dc765d40deb058073a07c2930de50 +size 637696 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 0f650a4c73..45036aa24d 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4aee3dd1cbe0bb41c536c41be943b7422d453ed5cd58425341b6546ed93427ed -size 728944 +oid sha256:595befff2323d68ad2d3561e2a2dbeed7aa663579713f07e89fa5ece0cce2784 +size 700628 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index acef8ac194..d049301b4d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a31efa2211ff4e5a90ed313e134f0c2985cd2df40d3e9905e8e0016f17c7d38 -size 638508 +oid sha256:84122e4c185ce1e890636d9e82dd38840084da1ced704aa7c4e88fff5ce7035e +size 617245 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 0496bf7e0f..0b8c357c6a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8469a53a6584974f05dbebb9346c23b83d465cc405c3d6721c812e751dd3cac2 -size 745890 +oid sha256:dc91fa48b27bae916743c382051fe081967d6ca2b3c8c1d338c871972831f713 +size 719990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index c7022cbf52..bb3f0be000 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ca38cea0b17d210c42c119cbd42c0ca67caee02b42392ba90dbbcdefa070d03a -size 661522 +oid sha256:b9d34f8d71c6504b8819b4828a4df1912eb365b4e40b7e5ee9b6cfc91f5242e9 +size 638188 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 5f45f35223..809ebf5378 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d13635a1d818f0859281c2f96f237849bf1d87e912beb64d4448600d062882e3 -size 790646 +oid sha256:2ecfcd27d2f4edf1004abede949142f8f0f1f34e908a1c7dabc1718b545e4588 +size 767410 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index f8de22e870..9cec696f47 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f8a86d8822ae3112f4986a4024d4b4b6e5007b7e7c26cbaf0be5561fe8ebf2e -size 707806 +oid sha256:b25f3aab4a8421894ccef5c32b4852eacfa18e6beb9579497cc08841e601789b +size 685558 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index cf85786467..e0ddb48077 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c2af8c8213d91276b31e86f5530c6a48a6d4325496ed295694f4d381fb2446b -size 815386 +oid sha256:0362e8eb179d9836978a176425fed2fab6497d3385ef455a4adbea8c570cb9ef +size 790422 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 0708e9f8a0..a59f5e4cc9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f18c31679ae15d3fc611a647f87207d358d7fd881dfded3e4124e3ab1fda893b -size 731906 +oid sha256:02dd91b27a4ec71eecbd008dd4a981037d3a032abfecc21c52a2d4d6f7b50567 +size 709310 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index b446e1366c..7537ce55a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b90f4bd83c0127ed34226ec5618dde9290ffa482145fcf8796e5d8ca1bf7838 -size 835096 +oid sha256:9f152759c7bae14c29f0f6196588c19aec87821062eed8962823d7ccac1318d9 +size 826068 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9a11d1f460..f78457e8d4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6257f4817559cad77526cc171b151d035ba5c0443e15dd89aeb6cfb8e297438e -size 665240 +oid sha256:e8bcdf6388faef4394a8f54a880c6cbe83685c52bae2a52fd40326120ff2f443 +size 654634 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index eb3924e205..e279878aaa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:74ddbc0b8463888ca5ba23b52d135db129a0c8ed7146a0ff4e7b4e68a2eead32 -size 735786 +oid sha256:045c26391ac5658d9dc7ac82ffded1d9c5c91496bdecc186407e2ba826f191d3 +size 725230 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 3437ee07f8..26f313a582 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:df83c5aeb57067f04632e58fd592d7be7687e63bb8cf7972261ac4ac8f4511ee -size 737858 +oid sha256:47ef55d5aca2b22d2b7142cebec58d42bb39ddc7cd5ebb1a5f0451ad14bc19f5 +size 728930 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f4d3153f6c..ad70d33300 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de599171bfc0712b2fedd7c1c44c4db6cd3883cbb82bc9386f5879957dacc071 -size 620296 +oid sha256:9c2b26737ea815ec969a06387d037860c5535cb7811c0a5b1bbaed905f82b670 +size 610181 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..21f72252c9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48d2d981d8dba0fc99faa58d08a738715d186dd5b6bc165ac151ea68060ccd6a +size 849624 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3a8e352908 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6960d74e6d3082e633363657197451f5dcfe2c304033e6bdbb78e29fd8fa563 +size 581001 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c70a5ca8f2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59163155d5896f566f6efdb5e45824e29412c283ffe510d5f339155730690932 +size 641288 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3c6a206d47 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2e9419ff6242278b3eb864138ec4e3e43f8ea3bf115ffe3b2527c31129b71fb +size 753078 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..20aa75c380 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a60a1f87fda3f7973ec2d20b5976bbbaf86836fcf25e60c3f921964a6ba19c32 +size 541533 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index f5a97eb7df..325e2c5f74 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f4d7f61027ffb41ca4a44eeed05a551337e10a0daf3558427b4d7b7857430b1a -size 802334 +oid sha256:a111f5469916097520385f283d4e41e9809fd4a0e18528153cfba09762961ea6 +size 772734 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 37ca2ee4c6..5ad8c3af8d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:2960b869e36fbea72d2e2ae2b55f91554f98092092bc2f2f7bdadb9e811f821b -size 654924 +oid sha256:db14fe8b752840002f0db9c7b190f74caad90fb1032afdf4f0318d53934cf038 +size 640864 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 1084e6003f..b95e253a33 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:735cac1f6a8b0a25bd662bb97aba8ae0e00b4e70f36b62a650aec663fade26f7 -size 713236 +oid sha256:aab3283a3429162719a15bf66e3579cf31735a8d3cc79d7915e2a3b1b11120b5 +size 700014 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 8feef0e829..8e29d4da82 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c50a3fa5212450f109abaaa2ff9f3d831698a33ae91d062051eea67fc2f55ba1 -size 680676 +oid 
sha256:2ed808c01c4b07ddbe0bde178455e8e10964485a0a7fbafb35efc285700c0ffb +size 668144 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6eb6bb1ade..a87d2c5c86 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba52bbc64f756993c2f73f5759de2e8df28eeb1195d0208c43474db5d29f9735 -size 614369 +oid sha256:51217c6e5601c36bbf5d5b77ad49448b937e1e9e6f813080b14d797df36e569d +size 602725 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..83b7d25607 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:768c76c636c992414b69c3a4360dd9c49bbc67c4997922dd9e7f7f2032a94606 +size 796290 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4bae6c56bf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f02ebc17208cf8177b90b03c91ecccd48f5721f511d25f8b5722dc75624f1301 +size 564123 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9f7080b785 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42064783d7a04784a87da035ecb9c54d0163543024a344fd9f26b672745c9c57 +size 616763 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4560642374 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e7338b48be1afcbafd3890478b8d7d706404031819d7842ff7793e4cc457e6db +size 692096 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a1b04e1553 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb67ce88d96bbfc484f22b83691e7a0083ed15c95a2aa99855ccd2312063eab0 +size 532055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6c476edec3..b69ae82bad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb86de61c579ba776205b893359be245c38f650b7eb3decd17244ed410a7c9c6 -size 805958 +oid sha256:ca3650499090cd74f1a6e38313239d78084e1c788574a384b2156887203dcd09 +size 780404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index c58b4e85d9..f87d96141e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ffda79f64742b4c989ada499966e5c2092731901be9f1fdcf738c71b4dbc55f5 -size 779676 +oid sha256:8f6a24adda5ac733580cf5527c12244196091a98750738f355319fc45babd5ee +size 753924 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 69bba75828..f0bdf472b4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e63358888fe274d51de4a849f2b450f4cd2cc0b37cc51596ed3637a568eb438 -size 717298 +oid sha256:76b7e6b64e6d1d8379f0af120ec507dcef9124fd06b8cf28388d4f88d6890651 +size 695444 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 8ae0722c70..e07c581880 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9153994f3e7f172f09dbfece0951774c78c4205a684702daf2eb6d384c44838e -size 688894 +oid sha256:e685aef2588df3b063c18d21439b720fd2181817d9b9c9b95243c0442e8c298a +size 666792 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 413106b161..4a494132cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f8463b8784ca19607b4593a006663376729bd6ec82fc4babbebed6000f74a191 -size 667490 +oid sha256:b4b236d8606c7796b9c6bc8d1c1db5ae41e0f6148ebc5b695bb5af7c44651c94 +size 654022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3d2ea7f17d..8001ab3299 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f6a76f476b22b943cf74ff44bc1f918b385609ee884c8c801f91e1093400239 -size 570097 +oid sha256:d213fe19921747abadcbbd8dc4c93c5b19166ce0f8878e80d5629b1b2404c564 +size 562105 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 353fefc4a9..3814c13855 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f5249c5d7e8c2711a110112a8c98d126ec6659b079998cad52350bc908e26da3 -size 741242 +oid sha256:3b0403124ceabe28da29553b09149d193384760ffeced875477eebe69d1a105a +size 728514 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 807ae07fc3..7880425b9c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:08604f4f262cb24a31c9b7df4b089de2c4d8a4e39ccd72a26d8ae899008dbea1 -size 647846 +oid sha256:9031df3d4f98226cf2993c9e0aec40a75eebbedb3c21931b4d5a0e1d5d1d16ae +size 634132 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index ed35ea4dac..8af39b25fc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c59e23a91a8218aee087df14f521c0f6b100f164e9681bf769ff9990f6cad689 -size 674298 +oid sha256:06435673093e755f1a9d304e9df9f884cc0d51fa40fc25e083ae1966aa25a4b4 +size 663148 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 27a4540d2c..5cd9ce227f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0ab9e1aeb419b82dae68f85a77ac348294fdc3e1e42cc2640b134181101d3200 -size 587313 +oid sha256:295f8bb5d7bcc3449816c6bb203d55ec213833f7c8aa487f3eeb7aed968b792f +size 578877 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 3ceb1b0b46..d8fcc1c190 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:85634a91e02030195c351577c7e08350ca2a45ed397b5a95c4bef45c35179318 -size 619634 +oid sha256:705383c96eb8af993baa2a93e3c40192a37adab5005456e04836eec7deb52aaf +size 619536 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a087b21c58..a2bda22196 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d966edbabda5742b09205d251b89a654167d99eb12c89f8e38b7fe4ce434bfd5 -size 534327 +oid sha256:a492626206afaffd20247ad6c6052057c74fb0d69cd509b100b7ebe74b1dc856 +size 528901 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index baa6610529..d88bf6b69c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23809c5b752cff00cf8d7b86cb84c2c1c826d1128f20eba69eba7ccf745bc107 -size 830748 +oid sha256:3d475df90684cee4812aa213930aeb6469a8bac125440a19361af7094b99fb06 +size 803268 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..50f885edc9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b520d23ac4828a90ce03827f92948995f6beb5d12a34a3a27c9532885b2ca27 +size 779404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 75c5f924f7..a75306df6e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp 
+++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b83527ee31e302b86f6b405cf2be6a6e135a5b2d112701cf970d49ef70d3aeb4 -size 739966 +oid sha256:4e2d8a7b775a19d38025c0c59a0f4d580aed8a8e7bb835f6b860732617ef6f6e +size 716632 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8f4eeee2fe --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2329aaf7b745f918a7202896c72816e29408053b15c7a8a7cf45e9b473b7daba +size 691188 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..92ad5c1255 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30e4e477a958b0d054838c2d8497cdc18b30612d535f5e64ddb99e9e67461b5f +size 585865 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..835c8df151 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8562f7fb3cfc68379b01f48d611e774aea5cf1e1e71484faa2c85b64e40b022 +size 492717 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ee2289f49c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:214432a34d828a5e24723928751a938d69d8720542bd90643eaa8afa6818182c +size 662580 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..735010664f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:fc2a17642a1bb44751860bf3b44f3e6d4d8044fa8841898b687e9d77a1228692 +size 555419 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c53d002db7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff1db0b3415baaf51b6bfa5e6640d8b026a0966be9b8ba9c3f36928cd1458e54 +size 686506 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f9a1317d52 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64d10d73b56fac98a0c6d3b336c90c425086344aa7b2342ebf44e07ec828bcf2 +size 601447 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3cd7db1859 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0bacee71a76d6ef38511feb31de33108f3f9dd4d330ccb5a23959605277e033 +size 555919 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5daf973956 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8fe1846724eb0ad7374ea9f1d883882a123b0830032b72eddad66b61c1b9b1a +size 461831 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index c3bfd72281..2d9ade902d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dab880f09d9c891c36660e6e65751ebaf243f36eb15f4cced5a0d6f9ab8eb49d -size 830558 +oid sha256:86b41407f0d17234d6c59c785d88022a216ea01a510bf96474ffb5490d2a08d5 +size 822320 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 9688de1f73..1d61c12b9f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1d212169f751de24f8c9e939937247333c1cd7a9b0126dff58fe38722137f817 -size 856088 +oid sha256:9607880cbc90b5ab985820d1759494d10d7393062af17f2549cdb9999c443803 +size 846764 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index b1b11202fd..19bb966f39 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ceda76678632a796bdadc9af8773680e69b6eb83616c4ae7a96773d9ddfe71d6 -size 795230 +oid sha256:713371bf6a85bb538fa4ac9a093d41472996a0a4a3bcb225b5eb0de0ae93fbe7 +size 772536 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 94dac52cb4..6419436537 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5830787675b6a7d52fa4128a5b983d915639f02668ccd07e9312fe157b4f7c28 -size 820710 +oid sha256:2b48747decee127857e694383c840488ce0f9ad34723d0e4d5d2ab97263c17ae +size 796980 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 6161360d4e..6bfeafdaf8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d6d46f14d6cda6e6ead4ba219e0f30918e07cd1470b2b0d7c53ab81f866335c -size 761422 +oid sha256:ec2849c06578079b49e8becb409522ec6e265cd1276dfbefc7d78a21024ed449 +size 737692 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 9cad9b611a..ead771d902 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c5bb23f7461aa74021742d063c00bea16756d96eda295a6bffea039c68e7fb64 -size 676462 +oid sha256:e37173963f30d5e18f526643cd7d4c24c2adb2a803fbd80f0a81a2e00d648fdd +size 653028 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 490a222511..83789d06b0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e38fdb1e82c9c625185b259e943296e7d48f5b9517d4bb47101709e1498cbd7 -size 788678 +oid sha256:44efd19c567667e5ed24facbd066d8d59dfe51ba29d0ead3a5fe9a88d4b70374 +size 763320 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index bb45dc989f..97f62093f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:968f355fb9a7ce2cb6edee8d483357258bae7333eb57fdca89c2bbd3bac844fb -size 702040 +oid sha256:d9004528e43195563702feddde11b8782cfd63a1216428d8e2c2bf8861c3eb6f +size 678312 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index a6095ee422..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7bb19fc4b9f0b9dcae9a8ac7d38611c04cbbedf480549ee87c67bf4a2b7e575c -size 579717 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index dd9137e692..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:44322327f1ccfe433506e9d353dcf5940c9cf89dcd9e4fa49233ae97da7061bd -size 540841 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 8d7357f31b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7db2093a43882e5ebf37079ed2181dc26d447cfb4cd8ca9f59d5d0955c1bfe5d -size 565651 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index a700a2d4a0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:da9cbe23434088c0353491046d32144065539c1302d07f751043fd2d2841c183 -size 532793 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 847931a8ad..d6836b2c46 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32daf25f16be4864f0042a6139aae7a2ee8c75b523f095d67629432797b20e5b -size 807830 +oid sha256:742807d4588bbed7b2d761290abbe99693b9b3067516a459820d59e10c0eba0d +size 779662 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 7f96d77388..9034f21721 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e54626e911ea2f1f3969543021bb36cb1a66949ed9fbe5a9a5563a890643bfb5 -size 719318 +oid sha256:949b2301c84ed32e858cbd5c0f33077bc4d19ee8d8b68ec589218c56bcf7df19 +size 696970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 0bf1c14e51..ad351a34fb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:93c1f2553af610a30b18b972caecb669ad10df584b5c464785a1df7ec5b1c7b4 -size 828032 +oid sha256:b50e3854ff17c50a4fe3833787fdbf2fdfbfc94a474d2392391600249ae94550 +size 800948 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 1a419f5ba0..23d1add1f4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a1c2fcd47a738b40c681f181ab0dc789caee2d1dc34a3839fd7e9821316f094a -size 740754 +oid sha256:f3b63aaf2bab4776a8a52e95a68be6583dde6e88503559cd622bec2f40732a36 +size 717122 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index d50ae7fc04..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:20b1b7367b0acfb1a4d77d565f366ae93b91be569f13cbe049af9fb236e69d24 -size 582657 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 7341904a44..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a1872f4ebd74e74e2cf9f1cf4e3b238813a4d7168dbda44d85767c54fe266cd8 -size 479691 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index 348f4597bc..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e54ae0cb7b47e4115ccf9fa3b23f4db901de74875fba34a0698eb3a33c81384d -size 556607 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 258529909b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2135653e61c1d7f6663952ed2b3d6c43aba29470e9c573978cf38da122f57acf -size 456897 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 528abc44ef..0627a1e5d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1848efa8a693447e4b6f0eca9cf856decfc8162fe6ea9ecbe16a27f7f2e744d2 -size 885300 +oid sha256:c914628f9f7492a71038378c7bc493585184afca445a9ac81713d1eb03452f11 +size 877112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6c8d774042..62ec28423b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a883b83b4fd8e1a046a2e5816c0b8d5b5bbdcbb6dbe7451a9d5246f446714b4 -size 694872 +oid sha256:d1b754b41a2cd99dc8cb767e415167a33ad5fb6288398ac121e8ba8e76fbac0b +size 685646 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 1bcc47e5e5..8955c37fe7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a87f21bb03925556b2d436ecfec959b14b452b8b932516f189c652a23e9846ae -size 771190 +oid sha256:e5cdadc1456b66540e5979ce766a2402242cd992e32311ed375c2eed7e0011ef +size 760436 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 8cab2baf7c..2d384b8052 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3196d12b6671bebbda71bdfbfde2441b4dc256e854f28b01d1c254ada52fbf13 -size 778294 +oid sha256:22fa62120e7136be1896e8789376618bc7c5bf7fd608c381b0bd5fa07f6b22d8 +size 770006 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5000146482..d028972fda 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3cc551aaa59fc4a3b3cfdb16be6f372e17bf81ff524d4ce0443d89f3601febd4 -size 647708 +oid sha256:d9a29b39e1012330a67072be41970226a71a73bfc1c18e3675985ed961346c99 +size 638432 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0dd8e33a89 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0635bcdfcee040608556d97f5ae000debb34a9bcb06a73d4f50f9cf11dcfece +size 901900 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6342490d8f..caa8d03c69 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e2ddb3d7c6456c367eb62d1fac45b6ebc20c1ba8ca429bbbd71071c803da0ae4 -size 605157 +oid sha256:b2c81f15488e146a07449598d38feb5fd1adba9548db5b31a95379b698b6bd7d +size 609647 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp similarity index 81% rename from cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp rename to cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp index 96d838c403..5856986115 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02fa69d04b95b825957b85392c77368b6df94ce45e19bf74ee821ce3aa275b08 -size 587901 +oid sha256:ec2489ab92395fb565a0013bc2265191faab54987a0ca71ae8634a42072e9fc7 +size 673930 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..eb1121c491 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f53d2f0c61bf73f1dd2b277d3168b4a5bb627b2e6d0267448daeb868c0b9cad +size 792526 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 16199c68ab..0e4195a4a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e2b8ddac441d824da9655f9d8e735f47de96f06d1022784b1630355cb5262da -size 565491 +oid sha256:30295d91fe6e55b6db03fb457b3d12e785d949dd5aee1e53021e5c9ae0207b54 +size 568057 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index ac07c67795..d47f07957f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2dfaf60abfeea269ebdf8766bd36c7916c9b6d79d2ddef9f7d31a47f234e803f -size 856286 +oid sha256:944d5e714d70b95c64276e03c1b3ca86852d5fbacdd71c320cd0d4a047312ba5 +size 827624 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 96a19ad179..1c584ae327 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:efd8639a69296429291657cb8078b9e8780c9778dd15986e24a6a14a32f6c504 -size 683816 +oid sha256:18b648c66ab7468ff54b01a198bb573c8b4b91546bfb7f1af29e5f1cf93ced4e +size 671876 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index dda870d66d..e0e7b0b880 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94119c4ee6ffd0f412faf8120901e73a1212d84d753513499b1e189273041c9f -size 748640 +oid sha256:549425c8d573d9fedbb7e9cabbe04c4df85088f3563db39e5099b781eb4f78e5 +size 735172 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 79e629f116..69a0b1be1c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb8b378b8da03e2695bae1422e60330213d618c8a8fbb6bf72f70cfd89880f68 -size 719532 +oid sha256:53d29156db8540f3de09bd8ca3782e7a2ade56099f9d6634e4f4321c15c8edf9 +size 709172 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 601539244f..a6f7ae54e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61d592fb2db427bce7a40d0c29c792e15d7569fa6c229482ca38f9cd0f12f4bb -size 641782 +oid sha256:e4e1beff1dbea771ae82cea9a13aff74f112eb69a60864c31513b1bfc9132011 +size 630978 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1487340540 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45817cd25f0895cabfc1e77e734f59901c2cfe2f3d3ac46c333bb9c90bb65d26 +size 852364 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 29020292a4..e30b62b221 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:489ce05f1bdfda166f61af341a980e39b13e488ec3498535b646b2dcc9864b3e -size 591091 +oid sha256:d667245cfaef268e1bafd39a70f541ffa66a4135655501fc93af3e2685a2adf1 +size 592769 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6a466cdec7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:140d4cc92ceb6a19246c8144516bf381220ae6040cc216369cae0bfaaed9e9d6 +size 649404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ec53be74d2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e38df65bdb626917a9bc414976490453939ccc4517989c806470d5b13aeafacc +size 
732236 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index ab8759c3c5..41c809d785 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:39433219e25fd44a92bc1039d07c1fbdfd7116eff80f32e862339d5746626d90 -size 556705 +oid sha256:16b56a2873c8fcfc87fc75ff37992706f35e924f2bfcdfdf3382887cc2ba8dbc +size 559417 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 17132efcf0..e703cf7aae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec4cc7f9222e0b7152091320ad7382e8495c47178c9dbfe71b6ae52d1acfdb1f -size 877080 +oid sha256:611e8de315d27fc08fd78b346f90e32ad8f18128dbbe3112149a52066fcc16a1 +size 850834 diff 
--git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 42cab933a0..b291cf2f53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a55510a0352bbeaea504e44060bbd7e7ac2591827c974a6c2520e0196ae998eb -size 828794 +oid sha256:bc20d1b965a95a8e5d125abd08e72ba3b63e569b497161897dff9022851778f1 +size 799688 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 0d1fbdbabb..8d99be7a1f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e109111c66c13bee588f54a55f1176d3da1003bbd38d80b26d18faa14547f249 -size 789308 +oid sha256:33173c17d5f1717ac6be783c60629604fb64b2b08f90222b0fbeb21f780e5fc2 +size 767502 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index b2f21c4b22..4ae3eda289 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7295dd10dd1a3ff5b68c04a747809b4bac4aad9606d81843114a631cef0e87ba -size 741170 +oid sha256:16b9fc03c97d2fba60b9ac12c99bb1135342f5720c170bd67a2ee762da645956 +size 717046 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 3f6eb33b18..1838e86af7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:101537423ecbff3a8b1253a400f9a7f9bdbdf7edf6d7f1ee639989766691e671 -size 695986 +oid sha256:8b79444f6bad07155b8534f1b60ea3081822844b22c1e4f4626c593c7a38bb41 +size 682322 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6bac8909e0..74a424340d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c00b1edbee9d12281cb148aed1eb36dd3f291215564784b93661eb7564914545 -size 596917 +oid sha256:a4bbf392a2afd301c2461935efe2a61e2386bc1acbf9bede5fa4bdc0fa29c6c8 +size 588825 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 55c159ea1e..c7fe13e5a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4b838b43b86aef5ab58b79176643330e8384d5b01790a8461c57d66dd0c9376c -size 774526 +oid sha256:b0be378bebc5f7e8af8de8b45cb3ae52a7d0d0c728f2c9c17431a3aae49e856b +size 760810 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 79986103ea..c410ab0256 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17f12721b79cf6404535aa9e23699672ae76e2c165794266ff13fa47d07a4829 -size 681228 +oid sha256:2462cd879a458ea5dcc9f6008fd578583d8a153ef430608eb24768eda9b523a4 +size 664948 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 8e0a6ccbcf..4e4653f3c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cfc5310800d1610205acec43f1c5d999f0900bc8d944324c6a319e371769d93d -size 711822 +oid sha256:f711bd5414140948a9a97e04d1fe8658d50f15e9ef36976471d18c39eee6d322 +size 699490 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 78e9a985a1..bff0b7a678 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fab021505ce061b0a1e1857e94d1df6a6c87df304aba02c0461576982eb7845e -size 625926 +oid sha256:e559618ea57764979a5f11cf1de0ab0db96b26675ea711acf3d92cb8db1cd119 +size 616453 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index d21a0f5fd3..61cbe9a8aa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92da058e900266d60836ccf4047fbd71471baa672198ceac856347af2f6bf230 -size 654742 +oid sha256:11d681d42d12eb6e58ab2a2bd93ae389c610c16d8a835a585511bdd2d5899aca +size 644728 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4f6f4d7b21..3eb70d2160 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06a8d2b9f62d3033e1bd7bc22322255a1bcabf4a3a5c6527157140b9b6ae2f12 -size 558731 +oid sha256:3ffe8209696998cc78ab453c7b71547010befd4f5fea477f1417683e46cad2de +size 551873 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 30b9ce32fd..4677e232d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6048aae5f3b3decf219f2b24a473883d053bf1c51a0273d95588b40ed8b72b2f -size 902954 +oid sha256:29bf636b65e80252ec373ebe61911149a06ac0c48829d6b748715e99847ca10a +size 872516 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a47bde1c25 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df7630be802e826951efe0b1871772cef69b34c770e383cb153a85d6cf2f5b1d +size 825908 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 11a57eeb0f..2f0ec805df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a460603ddbbbb6f9aef8f30221a8823f8d5b993e223228105d9c79ec80de951d -size 812962 +oid sha256:312529056a3bcaecd92edf46029c47c3ded74f32d3ec878f928a2c248db1ef39 +size 789628 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a5703a8677 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0f60f2952f2dcbb116ab77a072ddb5eab5ae88e3e93842538e1e1b88e2d3792 +size 742032 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 25fd7601b0..9edcea44e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c5d3dcfab205e70adf139b8494a4431624a8a4b58c6bc0513ea0450dcc47f92a -size 607357 +oid sha256:ec924dc293d4301307bd91835b9bd35b66cacc4dce4799ca2a02c2d95c342994 +size 625218 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 87fa037531..13e98d51a2 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e43277675b938054766f9923303c0e27e5a8b837ac1a8f6a948e4ba9fb61db5 -size 503405 +oid sha256:be3c1934c6ac967a37a8424864b10dcf41a75818746cdefca6fab5c41bfe978e +size 521213 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..913e197ab5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3604f12b05b7e8e264fcb27faf31c87038572ec4d7c3f2d6d51a9effd838cbcd +size 696800 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..48eae4f68f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:a9d3bf3ae1284250578311feeb202dff46969c1e93a8617d3ff09ecf8f6d1fdc +size 589787 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..98e2c14520 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7609d1b917bbdbd8be6e24e59cb9032d8ef09c098bfd62c3981f0680fcf979ae +size 722058 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5b6dbe0406 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3137ff7053a26561087f4bfb27e357b2ecd001259fd8e9f6a42343a8ab3ad051 +size 639022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 6de6352af4..36cd83efac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2a3cf9d4112941eedc831f1aff4b8286a0564df56ff9be6c612350feb4abd53a -size 580075 +oid sha256:643b82dbd3ec9a980ec5a07b27c75f9d81ec4966dcafcba579832c212293f975 +size 590187 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 23f13f5f57..6d257c1d00 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:64957a61ad5e0e5a8ced50c65075d4b94e0b52b4f0b9b8634e31e8680cef9070 -size 479031 +oid sha256:c31d519a249cb25789ebc48b47afb5707c3f635337e7cb13d766ea1cc26dbfdc +size 489145 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 3f34ddf586..92c38ca549 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7cd4465fd9ff6695ca696bc6188c919031d6e769fcec16c2fab57d2764f924cd -size 699494 +oid sha256:cda450d24e150f645c8556d5a832d29fbb85f39cf3b18a70d38ae2042b3ec479 +size 687062 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 5a259f0aa3..26c623b042 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf2070c8ca1a2759794cc4f410af6c1b72b74fa52f273d8aaf56c70383aec4d3 -size 605407 +oid sha256:34bfdecd5b1a35b6d9fef4a02eb8b6ee2eb2d0c6760e3c5fd40ad87080f550b5 +size 599191 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 036bf5b4c7..93e6a3c47e 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c0a9aff03ac54d0e97f485b157d5b27fce32b1e660d4029d9887a904a65c6ed7 -size 715700 +oid sha256:1605d61c08b07faa5906d3d8a72b84b576004f4c3f6ec068e1deba1eaa8a9e4e +size 705882 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index c57655db8f..d4e7d60b9e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c810fe636a2538b53bda19d2f609c1298de3d7a13258b1b35fd9ace1266cb37 -size 623390 +oid sha256:13f998561b99cc3d6b9274c1dcf1dcb153a45cf86533f41fc9e5de9f2dbec40c +size 616333 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index da6d95e680..e794970173 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:378d63c861b82430399c3f73814ce1be0a56694ebd4b083207edd8b36ec9c947 -size 696730 +oid sha256:b952f3472103486426acc40af294ea0519a6d6c6f702f65f296b40778f6a8176 +size 687800 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index cd8046d117..db03a29e7f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cd258084383e83ea68a623a97052f1d3d4c6f7a41c094b36f78fd3f4082fcf48 -size 608859 +oid sha256:fb54ca707510e71cec98456a42811c2320e86c46389b0c1be2da894611d11df2 +size 600423 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 7db281581b..4852f912a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:95d820b397b4c91d07cad9b51f7055307d5b6efa140f651ec314b44c926026ea -size 714070 +oid sha256:defd09f7d7095803fa671850a045d177b16ec0c869a08639e3b00ccb340fa289 +size 705930 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index e4951a7af9..40537e912c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f2b146c32bd4f47070e9de25154c5cee14bfa526f026d2cfd3c9536b3d073d02 -size 626052 +oid sha256:595e90c0068ab374f3a4693667ec35cfd892f7b106b63cf3abac943125edf436 +size 617565 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index dfa20fd22d..60ad7d9187 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c15f48a41ef13634705bebcb1c512946dd9d823183fd583c2bcb5209847093ca -size 764992 +oid 
sha256:320fb84dc9acad25d6c47af2fca997ce3d95d1e87597011b55362269eede2588 +size 756062 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 10a3669498..d57001b64d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a34f9f52aa89320310d4279a3ea0e76a2f24ca0325b09ade7c70c2553fa822d -size 675246 +oid sha256:b1a28c5c8421764062fcbbed8e085441413c9bf3efe906f1dafa0aeaa0758b87 +size 668290 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 97852215e6..bad505ad3a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:87e36d93f37bcec407790a278553fdf369759c79d5861844c4997dd75d99fc71 -size 783122 +oid sha256:6c339e602a12d3a36e72a546fbb2bf43d666617a8f9d280ed89dd004093ef5c5 +size 
774192 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index eb5d43ca7f..e4d18c6cdb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:81ee6135f7046c9aff87c8f360ade96ca2191a10537f6cbddefb1d1a21065d5b -size 692586 +oid sha256:b06b28905ffcfef76593808125f326d139a85af5e96ea90051ebe4dc4cba8426 +size 685680 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 9d00fe44ee..35020a2f15 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57cc7e322f5355521a8cfcce03ae9e7b30be119faa2ecc5ace88a69d85bef13c -size 859468 +oid sha256:9301238214888d2e42023c6640ed7244db08ad0c8dbf7894fab1d2c47455c717 +size 848860 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 96c79b1675..abc5f41423 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bc00c662db2b622a6bf5b6e4f287edee6d026383296e759a99ef574fe21aa3e7 -size 768100 +oid sha256:fb6eb110b578a6d82c0183c739d728dffdb51ca9a1d69b35bd6d45ebbdc9c735 +size 756802 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 09576b473f..039dfd30af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:883a775d5c788f305ba7182be869f27d5e026a708ca9c34a73d173d707984003 -size 853890 +oid sha256:1dbbcff2fb07f97d82c48e55fba867a8e53919aab891d06acedb09ab849382c1 +size 841014 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index e89ed7e431..76cbd40b84 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22a3f28699e904ac0c030bd4c43427aad96b7d8010d592a06eea3a6859f0e069 -size 732086 +oid sha256:1fae01c5455d2ff1bc7bdee34cbeecf309c704799f451d2f6973a1b5ddf6ec67 +size 722960 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c8485beafa..40e45fdc67 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d95df1f00c3c2980f13aed425a2cd2bfde9ca539b086c2c7289c0d47326384fe -size 725326 +oid sha256:7c2ea063270c14f99cf1f6e35e95081c7d4d9975861cf1c544e91ebd255ca9d5 +size 714522 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8d562b8bff --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c01820897eb2af5b322daab531641b3bce6cabaad660b91c8eb3016d4c7c49a +size 873304 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c5d14af579 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f63263ce9a9d281b5580ce896d9751fabe85957a16375e51d97ea23fe5700d21 +size 638722 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ed9ba0e79b --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6064b509ce217419b2285e2886525c064513e589e5d7e9e52d07d7d9a6d6c7d +size 705766 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c4ad50b5f9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9cecafc91b07368ed391a68d89483990acfcb7f8e04ac5cf81ac2e7e33481f4 +size 744444 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..dce07ec7a3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c6770d847aca3e846d8f16986b2c00970e2beb446b1d4a641332d0a45bae3fc +size 601275 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 0eb4bb5d1a..28df700405 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:456ac48435859fd1d0799d8655eb655a3464ce1d1514b5bd041dcaa68a0a1ccb -size 765778 +oid sha256:8910ca110e5d02729267917ed437b5f2bc9fae76f574d12e9ff429cd5ee947e3 +size 750040 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 611e858fdd..05a1a76b57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3f9d2cfb2ff1356768425f3da9252563732c69a4b91cd3f2e187e0e0093627a8 -size 751320 +oid sha256:53feeaa6303587e68ec9f4e3c919a1dfba19eb9e5cc72001e7a7ed1cb1e6bfe0 +size 737360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index e892c7bbe2..b4a279b3bb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:409ca53392a9bc74eb37393994ea42d006ba0e181e4c93a943cd222f85477ce6 -size 822360 +oid sha256:26ff414d6c3a12acff7e4ffbd69c7eaeea7d15ec65b2d1ff64cb9560644156db +size 807610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index c1393ce4f3..471572bcf3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:459568a121135fb059c2c887536b76e49a712446a80f25ff9ba718846dbd6aac -size 666122 +oid sha256:4cbc61d4e8d902f30aee3b8de0a1e3fbbf0522f74f447cb95016bb4d30f63bda +size 651174 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e2d9a436dc..a93922bb07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02bfb8a1f9be76c902f958cb6e0b247514a209760b04b9fb0ca7a1128680d3cc -size 715010 +oid sha256:5321aee92ccf66f5e6380b10e70cf13c3e873195b0e224a477f48d81fcbaf794 +size 701344 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a601e675af --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2245dbcab1a907f72d6010f131a408b5a688f1a6946218a6d2856ad6397c1e68 +size 776458 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5f1564d469 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c07e56ac6543d8051446c0c22501b9e3a8711e87b1cd49b0aaf0ff5226ca2b6 +size 617898 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1c62472324 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6abd705523ecd8bd081661a52477d2b288e9c7a4814e0ab5435fd2a1a9e0779a +size 675568 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..77577143aa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:531049704613a9fe81c90cccdd5499c212372e895b85ae096fad916e131bb238 +size 672016 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1791b295c1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a98c5e78fa19d5221a53243a4420b96f963e21439fe0a582193e2e9e647c75b +size 587457 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 98ded2aabe..ec4cc32491 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:842ae771625ed7e8d8121ccd10f76ac2b415a839197ea5eccb72329aaa7b2004 -size 787014 +oid sha256:ea1823164eaffda1dc2c21e92710ed124aa9e72d07d59b86144e8d0e684dd03e +size 775568 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 939170594d..885dee1e93 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c976301ca9e2997dc9f2b4488ce475c8847d775459b5519e985a4b47cc978f01 -size 752246 +oid sha256:be3eb2215f7dedeccda491eb287e10b46024365ad2a3d41f813bc52e0897e147 +size 742676 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index cfaa2d7159..bb1475a2cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:802d40a5f0b59b3159bca970444244852ef61e0f8739364ecb7a0b613a14c3f0 -size 694900 +oid sha256:5769804e3fb692fc22f7511337feac9c723aea34a4c88cb76bddcbffa056e0fc +size 687254 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index a88ec1c6c1..ba7926f352 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:1bb5ad02a0e7d04e42ba58d577de9365549c5c0cd0f732ae8f2a59214d22907b -size 661712 +oid sha256:929233dbb7a1c7faf9f4a54306523c6492e615a0cf4452ebcdb347e9319cb02d +size 653768 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 7021896e58..4bfb0a4eeb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf5cda6ca6c0387233f955c1e108a22681a5cf66fdd28eebe308562d90ac2d07 -size 765910 +oid sha256:cec5c0a96252040b7a297de15c20eee167994fe21a42e96a8c3cc91f2739764e +size 751702 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 296845ffd4..1126983510 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0c5dd74f13a327d7687c9d250c5058652176ad670611ae810c9877a9c9d4494f -size 664522 +oid sha256:48336a4540aa1fdea7ac85d42c10a7642d3e32dca85ace91b27e80e99e8ef2b4 +size 656530 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 26711fde2b..e08daba08d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6381915a2a7d54127b713730ed7155437d5a1d6117fef852d00a639490d76b13 -size 850220 +oid sha256:d163b33757b7af76f991285fdcf9ed2a5303cd16c8b06b9aad6f1ca1ef378272 +size 835914 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 3bb150fc5b..e246a6d00b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:969958a8a31ce39cece1f8ed42b3e6a582bd46aac8438276dc7e004397df2545 -size 749128 +oid sha256:e8ccd0536c670ea7ecaf3636061daaead69ae40338f0fe01c25966c327c1b70b +size 740594 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index a875deaf72..646e779997 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e2b9b81368fa59ca18054999b15854631c72f64f4cfb3f900158ed8a092c991b -size 659990 +oid sha256:8692661569acd03294b61bf4783cd857f73bd275d51bde68486005eff405cdc0 +size 648200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index d0d86219f7..01d891af91 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:448f5fdb6dc33f8585519864e2e138fb6c6c9e163ccd95139a448ff299cc537a -size 568419 +oid sha256:e080a4198dee8b0e256ef2ad23dd387aaa195a0ea161366fd75c365afdfd9d6f +size 560969 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e808ec4d3d..a7115d9e87 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:418f6a5a2f69b9b53079d7ebdd3a304c5156e2fe69593ae4841e3585bf22214a -size 715094 +oid sha256:853eab3cfef7d8e86ce0c6d96e835eef9a5bb390ebafdcb241cdf9b96ba45a1c +size 713220 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 322d667aa9..69f9780c8d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98239224d346267a9a89d55d6b150713507526a6236c0d3e8f50fcdd8b4d8126 -size 625300 +oid sha256:ce7c916c562f002716895df9f8a72d604abe8beedc83982a63be8c3a0f9f3f5b +size 619182 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index bf0a2525b1..eed362ec36 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb4578e91ae0b55685395709d04eb5956c82aa3ee36f56b585b372f8956c32b9 -size 804304 +oid sha256:a57733d1504dfa337fd39a9c3191c1f72a1f1e221708840b77b4145c1137d705 +size 791824 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..bc6310f7ca --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db781e62c7e3bbcb4c75804396031489e641256605588fc4198802a918ea745f +size 767268 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 4c531ba02f..fc723c0aa5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2cd6b471baa3a552c1d4956cf493c0103034a7f5bc2c6f88768f2430f6c838e8 -size 711944 +oid 
sha256:21ea45628697b10eb7e8dceab437e526ed14bee6af4d9589ce4dffbc430b59e7 +size 703558 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..47e292e9a6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59390b11aa75fecd4bc0a583cfbaa1c8b8888f11c0d94e67e97ab3ea66b5e01d +size 677720 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..33aa480d2b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10379cbd374a180a2313c20b5c7ab7bd941e8d6743e4eee15d95453caa37ea4e +size 639542 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8ef0121dc0 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2709d70e93b431a58ad9b8fbc87d6fb82fdffc67a496fa9261f9a3198ad4b0 +size 543629 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e5f93e6bfa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d4ff791eb824440eea571592c474ec26c3b093d08b41fbc72e4ca1f7f18c4e9 +size 718870 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..94e3bc386e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75bc46320d4eb756d4df73d74e23f127010e772e0304c3cbbfe4c2c937a4b23b +size 612745 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c3b55227e4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c7a11c95f0fdedfe554437d09560972c9f1b7822ff94b93ed706cab993e26a7 +size 673582 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8da97db5b3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6db9076002880e87736c6727f8e2989154a2dd8cb1e0265289b72ca17705b7 +size 582453 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..26f8e67101 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81cb06ea423674398fa9642d159da0266df079993dd034a00dfc018289863067 +size 606535 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2497aa2149 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37a6199b9c8d2400d40dbce23b9119f2091ba402b24f07a4f3c282e373b49af6 +size 511807 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index f5f46ffb9b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dff6b96cfa8978a84c3e33a8d52a702c3f90a3632fe16c049880a870f8f43d8b -size 637932 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index c046d5b4ae..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:124b5eb8f1f85f040b1c57b16fef6d5d5f54e28b14ecbc252aae73c711f851a5 -size 602409 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 94a8406ec2..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c6da19adc7c6770ddb371ab290109365c9988918edf75dd61c5581364c71d356 -size 620412 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index e9b589ec53..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:51d8c65f56f32c401fa0b7737358d6a6916e259de9ffe546c7bcef6c673aac94 -size 591303 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index adc226f3dc..ebfac28579 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1399bf9f9e5580a027fb6ba200a30303b68b9f6d425540fece8378e7c18c1d2f -size 787062 +oid sha256:2f5fcc4597783638e1f527c9313fee67370743ff33196eff10ce567106cf60b2 +size 774728 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 7e71af2228..5778f0c809 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d72206a224d14ae70b38b7e35c80b0af64cf492a73a85bc5b94ab29a1fcd6f67 -size 698154 +oid sha256:5b367d32c23b88b599c7c5b3f4548a8e15c7b3bebd9c91478cb48a219fa54c4a +size 687498 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 1eae1b8ab3..c0ce8535b7 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d294b8fca1be06342a440e23852f2cd27aad5a31a62557f070a962eacdd533e -size 807510 +oid sha256:b6399e803067c8ff3478872e9e134f101b38735e8bd0448e44def1ed2e51aa8b +size 792758 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 27e265fa6e..e440380213 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0c9cc24d945e0e757097a6619db9b57092bf1f4b8508be2d32528a4e61dc014e -size 716136 +oid sha256:31fc1b0b714a6a25fe7230c3f2697f9a67d479953f2a49f23393dfb2be8ee27d +size 704740 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index 4c1eddf48e..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1bf32623b5cb8ddb283996ac9cd0027276b5c3e63a23edf6b71b4170be856ba1 -size 636826 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 3b93d2e883..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb08ed74e1c3ac49e200c385c8ec6f9c795882701fe1a2484dc1bb6f68a540ba -size 533711 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index aaaa383a42..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4e26dfeb627e0132ff8a3d2563848e44b17fa61ed575ad10b15b73468d7f5420 -size 607273 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 9ed8e6286a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5ab23057841bc8f2573d7e6dc5d9bd6a1808045a1dadda86a1f7fa1dbe8e3bb7 -size 507365 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 47e1a1ff7c..29ebe96228 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd60ab4f2788eaa8e52551af50784f0251d1550ca4e503162bf1e4863bb7871a -size 911102 +oid sha256:975654481fce91625061137da927b77b5c4d6f4f7b1198c864f5b353472219d2 +size 899904 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4954200137..bf47e0eb93 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a63c4cfd648bb6a6d08b69ef46df7af092943bef6641299a3ab687f05b875060 -size 796696 +oid sha256:d2481182db3d43a7be49f9753b34cb4be33c9a946727b62cfc2777763d8f4374 +size 784954 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index b51d2061a5..2afd87cec1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e2e097574cd2a8a7d5d829d964eeee4d6fb038524f5aeecf2427b6d90e3dd35a -size 887420 +oid sha256:1fb44b0034b1d3464da2af8713896d0d582d43d707468b652c846ca03618609d +size 874692 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 
c8a5ce25be..b28815b33d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7523a8b2e42bb2e6cacdfb0bef85f0bd8c7ae80865d096c83134fa6af9befd57 -size 770944 +oid sha256:73b27043961cf1315b40e8f018f77c5c9ea9efa6fc30ae01f930b92db1065603 +size 757970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1fe44bc09b..3b9e353a96 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0afb11816b9b6f41c473ba5c0c60f094f00ee577c1542480214bfe8ff0a80153 -size 751504 +oid sha256:bb659f999acfcb9697ab7463a842831b66c745952581e97685e1b8838ba47cc2 +size 740504 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d8bbbdcc7e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8123359de0f0ec0f3fd4c88cdde42e1f99b5f8d69f9e3d0fe7c5ee8f47b3e26c +size 923756 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5004b4a050..7ec6658897 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5aa6ad07d1ef3e0dc3157cec2201ec25e07a6236df701aaad255adedd70a88d5 -size 664210 +oid sha256:f69f25f065d3dc1bcf85af8d995fc5eafc347fad00579229236f29ae7a8493e2 +size 666578 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..00e41cdcca --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83f204f24d6a47d913a88df876954d905730a6a752b75296a0dd1d368fea20a4 +size 739198 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..042d4234c5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4e76155745432cd8e48bbed6226324046211a6124c3adec2eb66e1bb787898a +size 780242 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7eabde5b83..fbb1f4afe2 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6af81325680ad520c0edb0ce6f4941586a1b632e340baad75bccd7237ddf0ee -size 627850 +oid sha256:d90c3c87a4f0d2a425af78b2594425ed21177b9475390adcaea285915a2cc14d +size 627800 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 00b1aabed2..8edda6e0c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6aba7048e2e09d75088b339fe808936d34486f349ada055aec0366d760209af -size 816080 +oid sha256:6284a116ec474fb972ae9f70d8380c73c9d297170df21e038ffec432613741eb +size 800294 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 
fc935dc5f9..84ce7f8848 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1bee96e7cb16b388bd3c6d04c170628c88effb02cbef3bf531587a7771f86e48 -size 779916 +oid sha256:3323d9f1f692e46f3e678f389b50f6c10779409bc2d978972cd8f246a739b8f4 +size 765512 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 8253ba5bc8..f937d581e9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b83e635902f7cd9288a637b5676f190e8c4ccb65f93bedb301dbb1d916d5f9b2 -size 854904 +oid sha256:5dfa6cfe4bf25075c1177c17148d1f13795f52a7a4d36effd98f93f0340e533d +size 840252 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 
fade7f45a8..3fb5ff24df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de1c5724f86229e85c9a90635200197796b4b70fc807580f14d7d3bfc4b3c52e -size 703056 +oid sha256:36b906d904d40934fbfc54e7ba2185c9003a34b2e6abe1c39c3a42b8eee14ac8 +size 687170 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b169b8e59d..cbcefae864 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:def5eee09d851b10216d58ab529a7fee13c8ab169558f60768146a06045350fc -size 741238 +oid sha256:47fe87b16a931eb687ba5e903efa3c44a390a0ef5edd2092f083e5a927463539 +size 727326 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4ade972144 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42065d17f793f3940846e2a2439be4da0631bd7b84e4cbe9e1f7c82a811609af +size 826118 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 81cccb23ce..37a782f31a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d58edf03005ed7b341b4f5bd545efc99c6854aa166c1ef5364a58be02020acbe -size 645902 +oid sha256:5c994bb9ebfed9615747bfa0c158045893f12765300dbf6a76e41dd417ff6409 +size 647332 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a18cc8108c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec27e4973d2dce12dfb4f12305e46543b2a777365bf191cc5dc96263ec96827b +size 708210 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d90ba42c4c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc3c4dae9638a60caf22cb8947050e0bbeb2e1d0960f075ce1795d07a5bc1448 +size 707914 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 77c13c4286..84b856842b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:44808f75a32c2a7d3621b8f20ea86c8cab63b19ebadb65e9aea2a46d5774f53f -size 616201 +oid sha256:8e07248843fe923694750dce099ae19a73a3ef995f36ee10a90cfa50f0c1965a +size 613981 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 526fb59519..eadd63114d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1b2b6ad47e45a61495eb099c3ccd12ac698881582a143ca23c56a3036c04a852 -size 854288 +oid sha256:ec7fee85e37dee068cb9e53675da9baa88055ea0a3ac1a8de8e9142815173557 +size 841756 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 8998710619..dbfd472579 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0dbfe572b4e0c570faeea6a4254740d1c8f90002e14341128388ebd9dd903146 -size 800328 +oid sha256:fd78fea22ff4523abe296659990fb0fde472676e9fe9b8bb6205a0f95199f1f6 +size 790560 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index e0ebeb5921..0161758188 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5070f10483362bfb989d8e67d0d56533aca25f7b53365fa477e554d946f0a572 -size 764788 +oid sha256:b101cd854b5500b0dfb7688059f2fe544b317d79bd6197ff0706f5bfc35a5497 +size 757684 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 24b078c187..3dfa1a4bb6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f42b96066f3a0a8b9475bec8082e74e1bf8d9afdd34ff13c4514bb5649aea239 -size 712162 +oid sha256:73d5d68e5a44f8341045de8d9b05de4c714ca86c577e2ab64ca0b9694bc0df6b +size 703332 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 6958c6d7a0..ed660f328f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a84f295030c096e45e6a5a0ba9b5981b8c215c952e0d9928ca024d8efdaec33f -size 793666 +oid sha256:d3e83cddb3c9f58648448d6aa81732fd2b8dca67698b0f1ef3883e26dd665272 +size 780002 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 11df6c840c..5b30b940fc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ad757ea0120940aae4e00731315681dd7fdc8c5cfb24b852db71c2a62c1a6452 -size 691046 +oid sha256:0ad119a98cbe4b55d709d4d84ed28083803da49cc69ab1eb60135e0a3f249737 +size 682906 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 1fae2effaf..ca090503e2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa8188017cf5d142744c64f853040e1299c5a6b04522c114b6133ae027ef73dc -size 883552 +oid sha256:5e0604f0d87670a4d06eb2b55bc5107d5fa175e08b41e02a678eef3c24240f03 +size 869838 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 0b00b02541..ca2540bd48 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f1740236a488ef22b3db7e329df1d20cf69501e6e4ea0c026288b19a25e339e3 -size 783348 +oid sha256:0da02811fe0e135daac53c1e4445ede8ae2886f0e29ea1994afe2262ae77b6c1 +size 772248 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 2776fad047..9c6bbce772 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26ef4b9933699ca92801cccc79c61ae8b1ef346a9b37af8321250caa13c5176b -size 694112 +oid sha256:c62cb36800048bbf63a6d57698ea65a57af70f91c6a42f6a70e80f6b7ca4bc9c +size 681778 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index a46e6746fb..5809c572f9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3065a87ee60991572abd670b169f3b82381b7bc486e2708ae87755bd9a457648 -size 604957 +oid sha256:0cfbaa2dbc13dd9571e34eb179581632e951f5d1d1fb23da0dba902ea5a64ae0 +size 597163 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 2a8d8c2664..2f06c59646 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8be29b3795e6cfea36ddb9750deee9b5f62384940d27154b61956db5a2d20cae -size 750498 +oid sha256:4cec591bfa34190ffbac060ca53a7c36ec4e34aa029e96e8c1ccacb2db9e7c2e +size 738412 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9de7994edc..7305a70b3f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8b7debd57e313aab8f714365ae5ec47707f99908a4c12ae0961b44f7bcdc50e -size 649504 +oid sha256:d905e650ae430aa631f03836a5f253ea3f4efa10051cc21efc45564d23f0e45c +size 642648 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 9a8f27c146..42b209ff0c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1d465d6eb0e1db90c2c0c971be0edbfab624ae337b2d5e41d6a1f13ed898ebcf -size 872516 +oid sha256:666a9e63efc3b6044049e731a345c6afa9edbd5bba8d68fa366d173378674b0b +size 859788 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ff933d975e --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21c98081f52cb508ec890bb3a8a0a0cdd71be905db88426cb17674181d1c195d +size 813772 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 37783f61ac..e2c00f0c31 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:377a81e97d911df8aef583cb3837d30c839d5cf20e5c505a2f25edcccfaaa717 -size 783856 +oid sha256:8fa41f2ff3f4e62e5fb91c1ae1ba4fe72b29dc65cc666893169c8fc4799e7684 +size 774926 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..27ac8f5e75 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:3e7a0d484122751dc21a0e34ce11a28faffa2be2da7179d9532c5f7c2ca98d57 +size 727134 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 3b06074e22..a66e773abe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:233a18522acfd65d03768ee71373097c6d99620595811317c57fe8b43b618515 -size 660342 +oid sha256:c6b42edebeaa264fd5c28b1cbbbb66f95a929a5db4b75181e1dfb360767b1d64 +size 678892 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2b7eac3f34..69748be152 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8bc22db5822dd697a78fbc8e05f9ff729f6064ac36f45d55d10384087f64f98a -size 
555253 +oid sha256:081a874e897ff61cc89a365a0036a1e3c97fa3d36d7926d2b47a5f2f8bdf0572 +size 573013 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8815afdab4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f9e8e63f3c3063b93e2e33e3d9162a31058a3d6f98530a1c62844eca152438c +size 753878 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8af063e854 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af0bd5203160081ff391d4246610fa70116b0d89ce8ea1aac3049080d413c45b +size 646818 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new 
file mode 100644 index 0000000000..12bb433bf1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d057e9832fa0d8b5039fe801811969c6a86e8ff87d653174bfa6a8fe7102b98 +size 705384 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ef048c95ed --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:420c59ca6de6ed6fcf424b9fd4b372bd80a3a5f2071fc712ffa6999ecdeb4e65 +size 618598 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index bf16424309..57dc6a2014 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:79180e252cd86268d3062ac33eb39ca4452f245acdd709b3654d55ea4bef6ec5 -size 630346 +oid sha256:b8337b04380de566a15e09eaf89d727f57fb3816cc02e7f75eb5677b27029fd3 +size 641298 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index f5baf8b205..fa8ae5b9ae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f84582fe48dea1694ef5c7697a46091eceeb5949e2eddeb9b14b5ef24f75928 -size 528119 +oid sha256:c90eb9e103e0c171c23cadeb397c21c67c8ff7279efaf9af5430bf5fc6947fac +size 537343 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 114ede9bf5..262de8f175 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:34cc82d95258fda837b8f3541142698b6cf33b013d036eed6e8b7b692063f166 -size 648828 +oid 
sha256:8861105c8614d50f488a741bb346bc4ef09dcdf1522604403068a9958db186d9 +size 640392 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 326d136311..97b5173a86 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:974bfa0a18178bcda40e23f427c3675303f3f181fa5140877e13982d0ac30efe -size 565987 +oid sha256:d1c1a93b26147e326a7b219407355d5fc31c6d9b82396c142f739fb8a989143d +size 559327 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 1beda74b2b..7a9a2adbee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:964286650e3ea3c254ae8b9ab690930fdf785e48b0c6e43b42cd367edf82e817 -size 675442 +oid sha256:b92774991174a730b3a3794a44dcf2a34847cdbfdda52d774aa08279a0195a88 +size 664688 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index f08e521d08..b819e971d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a8e075eef5dd181178cea24d0be1ce2f0241c74fe27a68c3d20581df1ddc1f6b -size 590037 +oid sha256:3502c563234099b2533cc4e712dc321c7474071ca11f2763011385f73f60a981 +size 582637 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 6f62474cb1..6108d6e047 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1b338566715a8014c78a2b6515e7bd62ec4670fb63ec17455a5f16f8c7f1e08b -size 652032 +oid sha256:54a3906505fa9cfb94d35f92aad130437009508f6035402d94ef59869e642545 +size 636936 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index a6e68b983b..ccea8af136 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4611358fe394af0fcb1623d5bb3af19358c23d01ba64f5c94d8761bd65865f3 -size 566577 +oid sha256:e59fe3cad165e9e223e762d82542f6505fc3ebf2187f96cba709c831dc2db0ad +size 559917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index b993116e57..50ece669d1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3c56b50c4e6baa370fea65a2830ebccbcfd8ca9d86df56d278d26554ef88e7e -size 673220 +oid sha256:38b455435fd0648ef7dfa56c22bf6e24317e541a45da142dc4f057e20930f41c +size 663106 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 837c9e282f..1db52b889e 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:695fefeaf62f59e2bff6aad5209c6cd86f9a49b623942cc7ffe87370db051b65 -size 590381 +oid sha256:3da2d776f036f8ab5a3fee5d6c5272e8958b4b76fb766a7657cba57435ee4f2d +size 581649 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 126b2805fe..62558f7c47 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ac41d7acbe00141e6f4d586c5afd3427e92fd70b8db6dfac33759984e66335f -size 717976 +oid sha256:5f8807cb848828696464baf1f79523daee7ed5cadd56af765a9fc6d0601c90cb +size 708948 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index ad6dd6c12e..5754f60e7c 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:380d621e173cd66b078d3b83488bc492614ff281ce4a40a143fc691bc39cbaa8 -size 635088 +oid sha256:d2068373515e7623c140a0bbdcbafe603c4d0ab85c7f10410ed9fc798b8240f9 +size 628230 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 045d22cef7..1cb9f52cb4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:30d553f46a5f3e13e1dd6498ca546ac8215a792cfde436a2a2aa28bc338efd85 -size 743604 +oid sha256:b543313bc7541b48027276e79aab43000ed993c905541ec216ce68fc48e64b19 +size 733638 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 076ccacc65..1bcb9fc62b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:728e36fbb37dca9f415b37604bd33f95865c94c0800e13a1756c5cfab07f4f3b -size 661308 +oid sha256:8a7f6012fd6470652b6237696e2f57561a98400569b8e98db59f5b6b9f816e6e +size 653316 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index f3f3d817aa..a5cf141ebc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3253ab6dfc95957a40007f21af184cd37e51f40d6a83ea5d926795363de2a73 -size 777078 +oid sha256:e0b95991b373ed733fa00967f0439e8e17bb4dfcb579d15cd845eb7c9740c1b4 +size 764054 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9c6368dd25..c53726a181 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a04a287dd1ea6ca810f06db05f3d7833a9650a025e5495d1a48e1a4bf5bc0c18 -size 645604 +oid sha256:eb49e94b904622fcf752e83e25673247f69d64f2a5fd4cf7933772014ce37e73 +size 635096 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 862eccbd8f..4cb442f914 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce72d07e325d9b9fcb03e362667cd8941863f030391d1c541c7f46840f3238df -size 710378 +oid sha256:0ab3021119507766064b67117a16786bf1473bd97f7d3e6b077c450f6ac4ba36 +size 697256 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index f5962256ac..59b8b460a4 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:004562be62356559d76eb62c952273cb79cb9d791d810f5afc5780c0a6abb87d +oid sha256:fe1ee217447d5d9765b078d67183483f0cdcf0a85229d77ab39a843c1ef7f713 size 702584 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3dede3aded..a8f7aec67f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:519bd4da456b363475c05e1435673e7f79e810c216602bee3c7b6466408144a2 -size 620096 +oid sha256:4b5affbfafec5bcf3e519858be589466e8d12c55d668bf14981cb0f5e5a71331 +size 609241 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0fe4549395 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37f31af3a261a65fcb396164c15f38875e6dd0b7685be6ac6e9db53b147866c8 +size 789288 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..fb6caebb23 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6096838c84744e3cc2fe8fa762519e7c7d2dd288137c0eb01cff7b157e8a944 +size 568321 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..01f349cc37 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d6c0e9f407af5d7cd3bbcceddaa29516886d54d928bba6390f36a7da6a0b2d2 +size 625450 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7d137d8b57 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da53bdf3b1b1de9f4fab39559f115224d30bd91fc44d934204075f4e6646bd86 +size 726682 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b22e9442bd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cace347c5519af9147c2839ec6e764d73c00b51b145a8d1ff18183cf7102d63 +size 544689 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 97f3ffce2e..340d33067f 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9010f8ab165977356b036f0a796e7063fdeea2cf0bfe961424987422f5637a3a -size 714912 +oid sha256:485a4b8028f156fca31719940ba859c9a75277e620667dc4075dc345974e5b21 +size 703960 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6334289d0b..17d61d325b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ca7d3d3d27ed1befade9aa2d7a73bfa0c8e80fc269a9393cc396ecb33961ba95 -size 637408 +oid sha256:506ed345adf7fee3889553879c6c8e7be8bb1ab171a36fdde06f98369b3cd90c +size 626850 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index f79492a171..fff5a4271c 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:88f8a27c8836a86ad87115baf0cedc3f36fe0da4924d3bbe4c2b1038547e2fd3 -size 698482 +oid sha256:97dee9907a380c1e03b37e2f77e96a1ac601ae9eadf3427ab3d4bc7ace59ba85 +size 685064 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index bd1ca2559b..a128140bc1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7364bb55d24cc4929cf8eddbb700a1a185234c187fb37f15c01cf4aa6c9a3c3 -size 662074 +oid sha256:fcae84079a7fd3a6a808ea64601fc00947f3e0b6cc970b5f2ef63c876e0a6eee +size 649494 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7fe0160d1a..85d64945a6 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:55a19c4b994b72bf4ac2ce1a9413093ae67a6d982013608e833f6c80b08c5128 -size 613379 +oid sha256:711304a1f1b8d9738a64f142c2212db07c516641099bd52d1622b71f3a03c972 +size 600997 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..819d17f038 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a82740e91ddd1495e6cf90f83aa2471994c3d320d412534af6bbd53f3f4ab06f +size 729984 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..654f0550d4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b16dfdaec7f81c357dac5ad4fd111fb6a9d10843d2d80adf1eba4008b64b2bdb +size 558843 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..073a37f0ee --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cde09ff471b60f393023bd698b86598a7e7ce5c7b253efd6dd26d85eb93e64b2 +size 608571 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..af6a32932c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:439c4870fd0f1089e375a8daac3100a0212bdf08c35608156c21da0997675f01 +size 672014 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1791d7b459 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6c5e6e0793ab50a6de0ea7acd1750a7a522de7027b12b090e4f61666fdc7538 +size 535999 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 5b54854e6f..3e2a320d0d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a1851f0c799f099a9321d34cecbc9487ac6fee15afeefb2415b1dc47a20b9b69 -size 733930 +oid sha256:89ee7e0c6c0e54c8dc6b9dfe8de58bbff9a7918eb821ae7b75bfc36dd04da827 +size 717058 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 0707243b2d..d0a20ddbab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72c845f38e80b16a54901b76ec99523f607e7ef4ce3878133317bb3ba7c0c2b9 
-size 724962 +oid sha256:70f3ac0ee7f8df741fe11df0c12bc38fae549660cde65517ce7ce1583ae86168 +size 706166 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index b7c716a211..f3e8a9ea99 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:286440683548c1d99289241ed7436c436131c8b69719c88dea53d9250506715c -size 645762 +oid sha256:b588d115cc2db1f4220079e514d722701fe0fe3f552003315b46fdbb246bcb86 +size 638214 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 6ca737a0d3..6ccbbf1ec5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ae3f8dbede321c46c94eadac4736a4c9764cd63a912cd7a013d84b30c3fec725 -size 630876 +oid sha256:d3034285c4860ad11a997847032ec7762ecddf914a240b1940e62557a7c1b749 +size 623378 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 7e217edfcf..2a4b870726 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:628449577d68bb6d8153f5cbb0217be2ccd5fb9e0249d486cd4b0a65e5adc5e5 -size 665268 +oid sha256:4804ad5bb802a119141dda0f355a8f8225d4afacc292e3bcd3655ded6b700401 +size 653476 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7920330fba..8f3d3672a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b2c23ae3dabed38a5181bf311228f93a809efca23422e6e59128b03795054033 -size 568219 +oid sha256:3c0848d2782a67d367edf452a0915d2c5208704f009e6a646cec0654ae562b7f +size 561067 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 58a0801805..5ff665a098 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:25c840a2d2215c2a12938ad2df5c91733b79f1c9fc7832692b72fe3f203a795d -size 729302 +oid sha256:dee8ec217c6608605f4ac40c9d9476e3782ea46afdd452d2f22caf3d593a06b9 +size 715784 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 4e3a8b6c10..c2897097ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b300a09010a6dcc40572a2a906658974c51f6c3f4abf6651f15e93e0753e7901 -size 639212 +oid sha256:88a4b5cd293e7395df91996c21002d104d847cfc31b4c38ced868298325f79e2 +size 621106 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 6a788bef5a..f4ca2c19df 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1de97d942b655425e02a8fef57a1d2fcc46318f7c9c5d1331e8f8c8758d759e0 -size 659692 +oid sha256:ef084715d07e023733196632556204589ba3602c9a79a51e8b695b982474389b +size 648592 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 620ba32bcf..daed50f4a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd8abe0c795bfd75569750c3e1bfe74d1e5d5b9be85ee320370e9dfc8bad5a79 -size 570341 +oid sha256:e427a0c92c591e6ed86dc4bd9ca98c0201ba942e8638dc7f7147dad7bae01156 +size 554899 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 1efdf0c693..767eb54a0b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cd7097ddd72d21c923c53d34fdc26511a514ef2740f4d530b6a066069bbcea36 -size 620076 +oid sha256:145e3be264aea3dcb33ec3b33bc537e2cb06b356360d6c68e10766c75ce473ed +size 619238 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9831718a7e..dde0bdc6ef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d6edab53cf393d8d1d7ee85b2e23f3a889c8983702c7d75660384d65aedf9a5 -size 533881 +oid sha256:9f0e17ee25ea792a7eb2ac07f3d76a01aa8d96cdaa995cf7ba6b05ec1f30c383 +size 527813 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 85ccdc8e49..2f03471a21 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:5e68638cfb1c2919d4e952db822f532fc3ddb719ecb7c3101a058592a8b5dfbd -size 759606 +oid sha256:cb97fc6f3da3d534e2e110600c873494073145dcd8daf7cae065be6f45cd430e +size 741354 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0fa3a4dcb8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdbeb3bd767f94bd8d54e48b30b2517a36a2cdb43446eb70d920d992a3848c14 +size 733324 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index f7716067a3..db12fb330b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:88bb6a01ee9a6524a73fc9b74383a8cedb4ff4548600fc4b35cae50c5e063599 -size 669812 +oid sha256:b439815f2d92f55195c2cb36c4e7ae35ec221f5507234ca945760bb4453f7352 +size 660784 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..843633b34b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e58d35b6903cc937e6873c3b48b3eff852188423f0236eb143896a421e497f98 +size 647970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d6f9f2d770 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddce6dacd014742f8676896a6dfd0af6127b595d98c916050c0f9cf87ae614bb +size 589613 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..871b2f1daa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:8d14511f0b20c4c3af022f95876b14e3db6e4438c16a11753af70cd2d37f82de +size 495871 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e409c19919 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0be97893fb11489d281ef2a2594e5cfe1506a7b25e48613fd32ecd0ed357411c +size 657644 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1a6786db73 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5581090fc313d5f5d6aa4bbe18abb621f2a6d4db527f2e8f7f4e5906c598d5f +size 552605 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..eb85e053d8 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0aabe950fd0fc41316af730185007053f5c58f34f5261c9b4a0c233a162a689 +size 672740 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ea17dc1624 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:591dcb84df2682a58219ac13b461dc8245d56b6d5132b04fa89f4f9e6aa0e5f7 +size 586103 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ff05665fca --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7563597daedc009fbd52e4c49484c57ce288530ce3d176f3ad87b6d7076ddd8c +size 560653 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..de26413315 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ef2afa232c0a2ec18a029fb2b4860cb7ff507cdb494ddf370e7aaa52c43fccb +size 466517 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index c4568f6c58..f900099da4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0635c3a78d4d3d3762f9a6fca65c666bfdefd96d0c0b70e4ed03b3b7acb25a92 -size 773082 +oid sha256:bd557e47500404f21aa01a2c8091e4f5487e00358a0ab9d24940354d1b98e8c6 +size 760898 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 193b891815..ce33410ac1 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5bd02e029889c5949e1ddc83ad7926c860bdf8bd8e42c78ff22c3edee29c3f53 -size 799598 +oid sha256:4405ffbc391c51e75bf36db601f661514b1b4eac8d350dc04858c3e42feaaaf5 +size 786920 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 0734abb466..27f5522cbf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33cac8af467eff2bf296636a08d80bdfc74ebdd5f142b709419720976a68919a -size 712248 +oid sha256:49e8a2c399326bc79cec5591f34f72fc3c2ada7288ffec8172edfee735b7db4a +size 702136 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 2663d14ed5..4e048afb54 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8eb55e0e5406b68697c71205855a7947c287c6763b0d202cccee567fb8fb2146 -size 739356 +oid sha256:5c4e099f7d5ed9fac5a5c308255151d65fb0e549497386eec0a73e1c1672929d +size 728948 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 1bb38bfbd3..8d4cbedf77 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:14542bcaa81c78187c1ec084df077108f53f24a4474ff33cf6594804f591893c -size 707598 +oid sha256:ee62ec9be7c90bbc20044ebca97bf0478ca0b13c5db2b48a9931267e5066c54c +size 691022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index a6666b553b..8e197f1ca8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be5683d30c4c4741178a0015122bd76d9a645745e396f1b4dcb67c732d837677 -size 616321 +oid sha256:a09b625b5151b39f376cc19a41183ac9f7aaf97ea33a503f9e9c93acff06709b +size 609021 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 715e696b5f..77e1aaaeac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36ffa2f3a1248bac02b8b9b5eb04d53f767183864d1808ac2af09633c8e79fe7 -size 735642 +oid sha256:f6596b3954fd4081f59b6a6348d1c1250e489f4070866d66489d992725f7fce7 +size 717438 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index dc89191172..b13357881c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:148d6295df91759f63627f48b4242bb62c36103fa72dfe0594b4efad865c360b -size 643480 +oid sha256:484869bb394f582ef1a539d87b264b6d91709ff206f1ac1744195d35694ef3aa +size 635094 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index a7539f3a7c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d484dc963de1be08ae7183c9a016d8475814e6aedb9ff3367729f56875f59eb6 -size 567185 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 36f8868d83..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:27a4ad22c6b862dd43b3e7551e98198a1c8f3c56d5d135477bdf4bcd2037c701 -size 545623 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 8393b24353..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9163b31404711092be4ae104360b45342de984438dda56f168c615180dfc6c5e -size 557263 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index a40a44c476..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6c78ad0f9fd48ec8e82ba912b6d72aef13358c1050da327324814657730fcb34 -size 536885 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ad250a479d..bd08db0cee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:208cb927cfbb31eb4b0caf1160f7df37b20c63ca74ffa6aba8593d6b9ca5533c -size 730868 +oid sha256:8fb45bb8370e272ca85c42f58fb24a5a690eecefa008b1cc286bca6fadc0167e +size 716216 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index c35db1e100..1400b95ff4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a679b2d2812f3ad544bf0cd262b64e1f9d9c2cc4d52b355e7facb3ba90633cfa -size 647586 +oid sha256:320e72398bed4c895e9830137b8fc78a1a688eca5dedca47ed38d9679a43fd39 +size 639544 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index b75aec5cfb..e6c0582a48 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:5158574b082fe2a19a0d60f4212e0b8fcd2269552ad249af5c69ba100e1c71af -size 757730 +oid sha256:6039465be2922cc854cfefd4e78c18c3d0ceeec1647be16bfba52b8026a103ec +size 741252 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index a90cd8509d..a529fd1fb7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:712abc324c715757c08c7f1cfbfd7730caf0bae76d519ad521a2236b6e405199 -size 670600 +oid sha256:b8cb6e21c333d5011309346e115211dadb60a74d814ee77a2547667aa30a6572 +size 662854 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index 9cc03c9892..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0141910df455247eb36bab31c5519010cf36d43e7481a2a6a52b4b909e82e309 -size 587637 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index a3521405b9..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f6974ff2a3d8a4e0697346426b48e70c405cefbe355e9fe165913f9116be6b54 -size 484573 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index 85bf194f53..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b1e3c2e920e7cadfa49ce232c1b708412bd85015928d0d92d343006cda2e269c -size 562131 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 165f9981a9..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a5c8b8959f7745e6af1e054632a08579a61d328dfc27b8f638d5b21019ee9f83 -size 461679 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index ea2a98d098..400e730cc2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e500c45819fcef7763ddce4044499715fcf730a1f6a948ba1ab14adeda28ca27 -size 829108 +oid sha256:7b233a8f20ec6c9a87790897ea297bb4768a09bbe01874c01a27cbccb4c8952b +size 814110 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d4264aaf2f..3aaa356291 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fab1dd09021e7ec1f66aec326511f405baf146345f393c64e2f9ed22f02b8d7e -size 675284 +oid sha256:7c659aee0efeac539a134cd249001c3584deeb33dd6b841643240751ebde0218 +size 666108 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index f244abbbf7..85811e4097 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97b4640346838e133f9d90dad369a7e9928374113f4a6714ac56d8685059658f -size 744006 +oid sha256:120b09cdc65bcf95a2f568c7fae6da4012e4b1ca39c52bc1d75154ef72730372 +size 734238 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 2fb17db6da..6ad08d5365 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5175c680315468312d65259c378122a9d3a771d1de0715008f0021b4bfdeb9ef -size 737692 +oid sha256:47fa603d97e821af39b97073ccbb1a2f82ed01e66a2cb08a85a1a8970bd214c5 +size 
743612 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d4b08bf6b2..cc3edf871d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d816112c97a90524972e9ba21ad6e16d43bc5b368a174c1ff4c79e565a86482 -size 647360 +oid sha256:1fa1dbb1f1c3f61000dac4504981f770b593e8a2f9e415bb4a32a1343feae379 +size 638826 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c0a0760428 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eeecfe3e3b4b2b0e21cd782fe17dbae5447555944d069425bc862d0b2d080840 +size 840578 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2302ce4e09..1119307002 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bac647f11f1a7a2e88c46f6ad20823558777c280dabfef4eb33bbb7312b54b8b -size 593463 +oid sha256:f41787a091ca711692613d656b5e20f1f7b9f0e0db057f7589f47e7348e39f46 +size 596177 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4cf66b9d9f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8591a97d0d111a8180841c51ac3c2beec6f59030ac23f21bc3e14cf123411cdb +size 658880 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..19c163e59f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81e0249d6bce4cde5300d470436a3846ee8b62703cb89dc49c5c3a06e3fd3255 +size 766132 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index a584c2ac18..07a3ed86ca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d5b1c75297e957ff55460ff4431b5caf78c2094200ea58895eeb6f27925260f -size 570275 +oid sha256:492dcfee01255d457ecb4d6f9c2dccf27faadf9e83633b5fa79a7dd8983709dd +size 571261 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index a53f9dec37..9f9608df17 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d84c92bd8af6f75d05c161ef37775b99a107402e353331e8909bb46d17419fa3 -size 767534 +oid sha256:7a83c44d727f6b2cc3b9f9c097eeb2ac1cf7f91eee5aea4aa317d8a8b7ad83ba +size 754806 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index cef8958457..b556f61fbf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f948e9592fd9e96ead5f9415972316bcdad492361975c6a387226f2f1506e7b -size 666250 +oid sha256:6e84ec8df8bbc393656e574ae9f99d98a845709d854bf97beadf6f77f0e7bb50 +size 657026 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 10bc86bfa7..d82d6c5d70 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:822286b7329499199fb57597320494de8fc8948bec7c01c85f440cf8d61ad1d2 -size 732110 +oid sha256:6e9c8b16070dafbaf35fef165a3c8aab56a8df354fd249744357b51abf94346d +size 720270 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 319de0c48e..25127b2556 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:106e366b42e9bd7848203fa463fdba7e9d4bb1d1fd8b76a5032ad219e87333f9 -size 700932 +oid sha256:6dac76c3cc482e200ce7cb68064c492b7d151d37f2660fd830a6a2db646b1abe +size 690522 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a07c6b9f74..3325246964 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4553e21bd775d9f84a8214f48a3d855682b21cde96e9de0eea6d3cb73cb11632 -size 640596 +oid sha256:a993598ba1bcd93e7bc589fa97a620989c529c96b338fe5da35fa6532d693b9a +size 630580 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d53d25af63 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ba9e3e643c0690b209354d9eaf0bb06798a34aeb3844d363bd536004bc96e58 +size 781272 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index edd420eeeb..1b5eaa8008 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f9ec871f29a2c21959ed5b88789d037915e0ea5900456a117fc0ca0c4bd07922 -size 583541 +oid sha256:eecd2bf5c63cda49077a879c2e368394e705ce8ed290fcfffcde28fb93b96c5d +size 587489 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a0f674a9ac --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db5eb556c9188f49993279d0dd50267ff9945b4c91c5031ed8b0ebbb7fe86f03 +size 642002 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..43b91758cf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d071b03d8bb831391c196f59825733b1f8e668a0b7b2b2576dbd6aa02f9aa910 +size 713092 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index e88a9b0b41..ae0306d8b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:abec1c0e084bcf1d06c9f0df5819cf001ec67feb0039db597670d5bde292dc8d -size 562375 +oid sha256:371b7237d178b87bb1584924f8ad91b2b3f8befcfb112dde6529f540b905cf5f +size 562573 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index dbd7091ef6..854f098b03 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:112eaaa2d58481ea06eaaf08c1732935ca0190b085aec022cbc00a16673ceed8 -size 805100 +oid sha256:1cedcddfab90e9ca7ebde826adb9d30e337a071800c92bb5196353e002af69af +size 792914 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index d43f32ea39..5455c96793 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:18da96093a1d2c8d755dbaf0dad9367f2dff2b34ddf6b43088792f319bddd2e2 -size 773786 +oid sha256:cdeb4f20eeb871101cdb04b3d76632dd7dab0705b5e5257057cc5c761a820394 +size 752818 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 1998878dbe..ce0ce1d81b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:600fc9891ccb62137ebd0c9afe25ef652c8811031ab3a4207d6690fde4f8770d -size 715996 +oid sha256:244990d58c040a5c34b005a2b050396b73e1908c00350f676d3692b0f3bbcdc0 +size 709386 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 9554ef4f5b..fa465d99c7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:67777baa4e2dedfb2a010d70233b8d98d3ffb54ce69e3294a19105c92f19e5f1 -size 680142 +oid sha256:4cd972c8e9d97eaaadaeb192d35667d1289462a41e4bfec26ef5c16c1a3d5854 +size 673630 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 14cac7555d..f8d553a54b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:819fd1c185f9fb75d9822505db11a5c1b91d6fe0727bee76b15687b69b632b02 -size 693024 +oid sha256:c5096e831fdac242d7ebf276f772be6dcf4f3211eab85f707ca92f77903c6e5a +size 680988 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 62e2116754..99c5df0521 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e7fddb75188748455da367dee03c793d43659264e58b11061d03d91ddc3caf75 -size 595731 +oid sha256:243ba07b8784848ae328dc90f39d9c4a7b784f7b38afc22f53d8509ac6257c4d +size 586899 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 63122d0814..137a159d78 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0762f1e5eefee73e2c4dab2de87147166da374b063841bcc21396e9c79de20a -size 762486 +oid sha256:0a0f536a3f4f05574b10215ab3aed660e9b29433c91ebea5c060d94d1d3ddabe +size 748722 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 3bcb528ee3..cfab9ab3e3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7128c8dc02cc94a5da5e95496759119f76cb54cdeb296d890433a0306eabf65a -size 671704 +oid sha256:8fbe2cee20274a5f2055279492f3cd6f80801171781a48d65513b72eec863c62 +size 653500 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index be59539e39..1e2797f6dd 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:74a8a4c7f59c75f7cc1036a9c6a990cafdfaf4bb264fe47d227bf8e9275cd1e8 -size 696478 +oid sha256:637529c79f987b724d37d24a0a89dab27c0aea0723fe486f5ff11733acf0fdda +size 685724 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 6a4cc39786..e1cd3b1441 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:34bbb811fc5f4fae0b44b91baf36cefd41d051ff223e1c1b44a72405a5f0d58b -size 608951 +oid sha256:d15b7e25f249b75d0d491f78f4778f165a916b006918ea1a925acbb6a17f62c8 +size 590057 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 8e68204a04..6541cfa884 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d1ed4d5a4f11d959ae0ff09d576ff1fa88c537f478df60d18b35060eaa3fde34 -size 654986 +oid sha256:ad506d2087cae22e0c4d873855908c8a08f441ab8a53fa4b35181eeabb6cb650 +size 645218 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9b367750a3..e36a01e661 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:90d11ceb50e5fa0ceb4e9189958f0099b01078b97198762f1dc25c2711032a8e -size 558333 +oid sha256:d5e1b53dfa82bfaa49ed65b3259e49b2cae40459adddca6813bfc22465b88289 +size 552265 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index b1a4a9b6fc..f923c17e05 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:edac3d624c83a175e9d1e0d24a18001e9418fc36eda6d5c394cd5d4a394af168 -size 831862 +oid sha256:efb3f95315ba879bb6c5537561d7f40bfa81acf1003bb42ce442ac7598ade87e +size 817162 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d9ddb6432b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3764eaf7150640c68b0f0813774fa8ca2b1e55c6a27344d8c45abadd4a8ad08e +size 779926 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 7b28a7835a..d4fa32cc95 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b107bd500386a1a1faea7b7a9b6e3b066dbfa2ad18e4962a247db1a62dcb3511 -size 741278 +oid sha256:0b6ed6424000cac1edeea4415ebd62acdcadcf25aab89bc8eb830717cfeb1394 +size 733978 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d4d5388725 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78b08c3f59fc182cfb1365142a02f344cd8e9c7c17a253f68c839e1be2579974 +size 698914 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 176bf208a6..09f8c46370 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 
-oid sha256:7f1bfef2720ea6fb993e758c6170f48aedeaa1c03b2ed2e25145f503965cf5ac -size 612289 +oid sha256:e7be29d1256ebad387fbb0ac034d3a578c35596316cfbdd43b030d37c77dd7d8 +size 627238 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 91fb9ace69..aea22a4076 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:630dab832c05fe5cf9b11c469bef26a02eca2d10d6931d6b721d2d2101b04a7a -size 508287 +oid sha256:fdf03bfa63ab087345444794537e797d421ad361ccf1fc619bc390674831152f +size 524467 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..42d3f51809 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a761138e72c6533240eb843765c6553dabe65eab04ca8da5b9a407907351aa08 +size 691914 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5ab7494d98 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c244518a045e4ee861e8fc86c9211748d5edde107b7884aeb5a398cd6c974fa +size 586627 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8bb7a2fc66 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcab1cbd3e1a3b7df1eb03b99ac053a15952a40794a01fb2687b97a6fe871621 +size 708292 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4acb603477 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ceff221855c581804ad1a6706aceb14be8d3acb958199c7a0ddb5828cc764908 +size 623678 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 7fee964176..e40033297b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a5ed5acd6d92b1c5e910e5caad62042fe572be2a667653427aaf3ebc4f414b11 -size 584809 +oid sha256:be128f0b1d9d03638e745c66a7a70a45ce6982f8b486cd71ab5037afc9d023ad +size 594873 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index ad1ea2b1c3..4aa0198636 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f7d1cf7f0533854d61339b104553c8f3b64f1fe4d29676a6bc023688ee68c6c7 -size 483765 +oid sha256:a43ba04800b26c846ee078a9168dbf08b61fde1ec7e13943e151e80920890267 +size 492941 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index dab08c1ed4..8ad56addb6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a244ba7dc85cb201f854626f366c27ad43ad4d06272a81e4fd7292cc3981f2cd -size 693476 +oid sha256:a78418b426b0921246af51ee359b1ec9b7f943639115a9ae45e84a10de9a34d0 +size 667922 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 6ab4e9aa3f..6806da5be5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:0c239bc0cf97e9ddb791cb675be1aa740875b5832185ec36043c5de92539c57d -size 609797 +oid sha256:79735ea65272e0bd374c41f029a265d67141712e0c5141174e3ac189b87659a2 +size 587005 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index d5db8ffb6c..a5e63d1a47 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:15dc7bfb5dd26f4f952578ea9f3ec88a762a9b120bc7000cc445f78af9a6a71c -size 718462 +oid sha256:63e101be3a3b3b3c006f804c1ad3c557bdb9370cb2d9eb93c6823f8de530e356 +size 690540 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index e3f8891b03..9edcd6d45d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b09a22a596762e836d8d84aba466cfbf7e5c3d112e97a562423b2a202682e788 -size 633256 +oid sha256:559621f51ef2b8c498a7bce49312f76c9d8d9aa8c61b5dcf120db69873221784 
+size 607405 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 86d6eb783e..b3bbbeee79 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:47c4b5809baa13fccf685d61a5f17988da3038cd912c0b01b32c2d6cda0d35f6 -size 699196 +oid sha256:dd627c9ee53aae7563ad0f60069e99a42dc6e47a0c569c640a2afbb29e319469 +size 669744 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 2db1e59165..6bd7c0f6ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7139a4137c9197e3789f28a485811b29ef92c9ac5e6a546f4eb5d64bbc6c286e -size 611275 +oid sha256:fbe424de142c15fff214c9356bf3df54fc34b2b0153c07400f246554689de43f +size 586955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 346393dea1..84893158c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cd83b1991d2d4bdedb66590b0aec09032897ed010507c7e5ae6b965a036f368e -size 717080 +oid sha256:987b72d5b9eb196b6e64a9c857303bf20eebec06b06fc0f6033dc7447f9ef6bb +size 689848 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index a7a1c2b29c..e6884a8fb4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c132c58718f3f28b6d52f93d53d0a0dc24f2583d1680513c00da434e68963c4b -size 633500 +oid sha256:84956128936f2367199fdaa02d84fcecab38a9607ef26fe60e874afb27cc06c1 +size 608685 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp 
index 0b37b53aea..cfe64047c2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:afe0d0c7fcb1b8977f99f18fcfd45ce77d72c1ab2175c3eb5ce5226c9a889864 -size 761884 +oid sha256:265f7e07987f6df89f86483bb13aed56c4d08629831f541710bba9cbda946cce +size 736478 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 0572a0c42c..f36a90d9b0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:acfc87e390eda4a2e8da9ec333b3b3a51d9b2f6f242a0e2e5665f622d8266f8e -size 680574 +oid sha256:5653e9fedde641b37022698c0c78e6d6355c0994b21584cc4bd15af5c3c91c92 +size 656006 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 65a73ea72d..33f413b126 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:24e14d04e6f7c4ac96d993974d1e038ff9bd18ade3ea380240fd49c220a4a814 -size 786624 +oid sha256:ff461f6d11d3f360eb09afa6a1f068a896b4ae32b1ea657e510e8695acbf34bc +size 759490 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 2b7c1d2122..30060de6e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b96a5295b58db98d503551ba2cb1abbf62f091d3d232bf16e0776567ed83386f -size 703884 +oid sha256:14e265e55bf000abc915921a3dafb5ea2cd7144b9c618b7bd2bb114e0cb23529 +size 679810 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 6e6ee94f7a..f44dc32aee 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:67355ff2ca6bc38b6110b3bf67d5658f1be556f8c6b24aae751c0d072af9d3a4 -size 834998 +oid sha256:5454e76f7bda5226f4828b6811aa7d260ae4b654935c39f3957f3a0090f2741b +size 825970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ec5d586c3e..7b0ab8acc4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6d86d01a9432ee076efd74ac5c72a8a2cea87158d792ed15f05326eb53741923 -size 665930 +oid sha256:c4c3edc4f33443d835e6a5251a474eac60ca3249a281a558402bb756855ff5ae +size 655324 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 1b624d52d3..7097d0daa8 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d191f268e6d4773e2f8b951105e2958faab9bf6e666c7d646e803e421cc1f889 -size 736478 +oid sha256:21b798fd63fd1197653391321c60a57809da74ede8faa42804b8b01a85147c91 +size 725130 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index f240950530..de6893490d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bfba98d0eb4bc990cea8c15d37ffd4bd50ae1c73934eea9680e4f911589a50ff -size 738550 +oid sha256:ff9f0e586431592ad563ff0ad6d2cf0adb65af29982ce60f8ee24d4ee333fb01 +size 729620 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ffd5b4a894..795066555a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:736f836fbbb2e4e91b7adae168074b5ec206378397150b382542721149ce76c7 -size 620196 +oid sha256:652fe61dbf1867075203709500363d421ee3697e3c15350adff358432931d92b +size 610871 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c6d1179a03 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:017da6f0320092adf71d34f4808b8b31dc32f50149bb4f3e8148662a76fab888 +size 850316 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9a307710bc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8ff4b02ad1245d325afc0df52319b9bef5a170e719f4c0663a5f4e3dd2f0b2c8 +size 581693 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..30f5f8adab --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be048a1f1bb761953e749efa31deebb4708c7728b825a101553e1e9e62b8c404 +size 641190 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..23ca48a1ee --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98f6457e26b6b76cfcfbc70f637cc20a502b0ca7dbda6d842157d1e01ce608b2 +size 752978 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5d4bd64a17 --- 
/dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8b45589e2e0d60b07a92efeb1db5b3675283e1a895638b7d7b49d18f496e0ae +size 542223 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 2cb5d2d3cd..18ab6fcbe8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e811a1f4b5245adc5b7670ede865078e8adfe059536cf07c446442010efaaeff -size 803024 +oid sha256:ddb3ce24ae34c390ff7167a6fe4c0b090fa662e03f0b28aa63bf17666e10f385 +size 772634 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b41d388497..86dbe3a2ea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:7ec5f4745c0ddc92e37c490604c92f9c029e3486f53dfe92d3d625e947579777 -size 655614 +oid sha256:fcf049800bb0129ad5ce49a0322df0554d4e8fdece5aa1ac1a74396ef2988a1d +size 641554 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 4a1db6bec8..1159cf99b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:328478b5312901251d96b092a8cf0ba421a951dc583c57049cca4864a0c2fe2d -size 713926 +oid sha256:1d74d2bdb37c68d1a50a0d37f9b0bd9df8c38ccdb9a460e9ef0e9f342111022e +size 700704 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 026377ed0f..f06efb10cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7010f2480b997013f48bf5e25cd1e155384072785bc704919e51491c0ad59b43 -size 680576 +oid 
sha256:3cdb569983c2feffc1aa031832137d84070f68473186e6d27cb03a3ebd4c90c5 +size 668836 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 09dd741997..447966c2f7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:815afe8ef42d047619c247b0fe94354d46aa9643c4e43acde205adef75ed9dd7 -size 614269 +oid sha256:265b5ff5cb0e89a46593a73e94b5123c26d18e978363c1e504430775c2885c46 +size 603417 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..085c0c3476 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a604475d87db0425097d931f4de9614c1e27de3a385ac519a647d12067340ac1 +size 796980 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..16aa751314 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bb8f10ce7f2012a644ef86d07e2796b7c87fe7ed9d883d79b6dee14b8982729 +size 564815 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a8d077afcb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab7dc1a07b07544e671ff855300280c2b4e71fa1cbc6fcac479b63d1bf39a966 +size 616663 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c8c56700d5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:5466d326a3db3115ad1ab9124396895d39d1520c308cd919cce078678f36d5ee +size 692786 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f2048d75bc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e85e1ca684190647866b6e6eb1509cc5e35f97ce025872276cafed3d5e35aa3 +size 532745 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b69cc0033f..1eedd435f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69ace207004f4407a1cc9c978a24cdc21d9fb76f91154d3649c6c7942e25c8d7 -size 777986 +oid sha256:41757740e7465a0662679d4c321dbb2b3702fbe37b5cf1a92de6da3ce1c5d86d +size 749076 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index ad7059cf26..034401b2d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a226fb72a09dadefff97199bfedb9b73addbadb3608dbd53b908db4fce3613d -size 787026 +oid sha256:f6f1d9b333409eb3382a05f92df53bc3779bec710b41abb1553e0a1d2ac7fbf5 +size 760682 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 65e13344f1..ca90658fff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ddd62f2cf689aa49deff460d80f13156d48ea563576a8c80d0d38a39ede73fd -size 689474 +oid sha256:9f552af92e64a39abe65adcbb489d2c57afb2df7d70231ae7f913a4a14e78235 +size 665054 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index a38ec8e3a9..5fb58c3f9b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e394696b9eb19285ec07155ddfc3d76734c35e24c31a5071f85f56bd2c1828b -size 699402 +oid sha256:635cadaf554cb0929374fc3725c9c5ea95eb19ae860b42c16f63aca8ee6fc678 +size 677646 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 7028ec3f7e..ff31d466fb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:132cc4325a41b4e2daf197067f1e28f69169e9c087ee488490993612e71e9276 -size 680514 +oid sha256:abddc4c139d4e854c91b978ff3360fae1ec676c49181571e9755e73a897fa826 +size 667094 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 44a7291daf..369e7f4ddc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01b17762c20ec0dbfbde025fb0e7e4ff18d831235970600a257329f893a71ecc -size 579075 +oid sha256:56a6bffbfa3d39f65edf6cfcabe12ae30b3ab1bb1cecd4e54b40eb78f2fbc033 +size 570985 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index af49a29780..a963cef6ea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b0c8b253486f17cd0a358afaed1ce92c6798d6bf65629ebd059bf83302b8b912 -size 762850 +oid sha256:24eed0b53e30c7de2cef0bcf92d45bf298aa173b8e7ff0b251de30f48331442f +size 748050 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 92ec303d40..88f84b57f4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:69404465776507aa574daf505ea8821e978b290c1a186ad429fe4f91f355dbee -size 659934 +oid sha256:d37539d37d37212a4372230b4079d006d9e447ad478f203f04ace90ef3090fae +size 649376 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index d340885d30..6dd8ad3179 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eccad437a03b5b5c36715f2299fc8f612cb0ffad30075cebc7dcaa5d3daa921d -size 681550 +oid sha256:dda619ab846667b24b314138b0bd30543e600a7e76e552c4d59820ed33b62e72 +size 669610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 9ef0135ad8..3089e428d4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:50199875b7c5221196cce495481c8cb5e3097c33011a13e9e12ff165d7316716 -size 592839 +oid sha256:cc25fe10165a7db067b3d889828e3a8aae23a6ee8fb6c40b8fa10d2da66513c2 +size 585291 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index ab31cdba60..bc7bd9d438 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1380eb11c5ebcc87ee704ef60316d4dbd65f6cb5112776b72d80d7da3969534c -size 620324 +oid sha256:ee8882d1f4fba4a2521f4861f586cd82d9241afd01474675d05392d46981fd94 +size 620226 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index bfd390813f..bd14b12e0c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92e53389dbf411c5121ecf58a798c60133764bf9173200d47ddd959fae42abc2 -size 534919 +oid sha256:457ead81a67733c169fe1e7064e7d306c62555cbe605851ae87fdcd8373e27dc +size 530381 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index e18f5b2c43..c0ae8c9f23 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e8e070225071fb1871396df08b8eb30e69d17fe9669db4e1811c0cfea697b582 -size 802776 +oid sha256:e5fed30bb810ad75191aaa00ef82d285f0bca2cc6520ec4f1f865cd984cafceb +size 772336 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2c14b49972 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c03ca2447ae2fc0f32909203e760f25450c8131337ba822def53fffd84857d2a +size 786162 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 1a56fd79bc..098dc0166b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp 
+++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e45f65e5ed72d8d6ab283d145f53073877dc2d6b4a023abf10725df7a286a974 -size 712142 +oid sha256:795d5a66122fcd4322c5d7933a7fa39ffc981a1f3d9ee5c5f58e1b0efd0e2dd9 +size 686982 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c385e27e09 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dcc7353378c718df1d8f779831b4521f540cc7cbc3d83d0e87aa320d37caaee +size 701252 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..fd20476fac --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffd61216bcddf6c5acc98d9408929878874f4f74d5d6303c86ef38100f951969 +size 604761 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c95e62e73a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d43c2eca8aa3efabf60b72506d6d7f275c3ffc28dc8e8c0d24739535ca81ca0 +size 500017 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..336a45a5f9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbf0287fec71afbf6b4c1db144a0369c01343e2220a0ea283e8a486a85b9d4ef +size 679550 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d77afeea29 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:939f8792ea24a930fd5ec328311b3a63d3daa6e4ab82eff4e59be06a79abc543 +size 566815 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4a06c01c09 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e13da5e3e067d67303461f552c00b567fa49cd81021aefaba9b498a974a41a1 +size 693758 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b5a41221b4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fb683035f5312eb0fdac19481944f6e9067969462ce4096207a5e712b882b89 +size 608699 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..48cdb432d7 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c147e7962e4d0848c5bf3321147dc98449ae85a903a34ae34c05d243e2809c42 +size 557349 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ab4d7ebeb6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c33d0cfdd34a8fbe3d45aaadb1433849ebab3174c26bbdcf7977bb55be0a3f5 +size 463411 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 576771766d..a40ff7b783 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fea6123bf27bc904e43df599d11f91a41b426f4950388b5d47cbbc5a53025ffc -size 830460 +oid sha256:c62b714b7944a51bd4f6928ab4f8cdbfabe2c03addd3b774bd05afd569e5be28 +size 822220 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 0b91074510..2e4263964f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c2d216d9857df67c8e5b6c535e019cb32abaeed1eb49785a1327d97a159d3a5c -size 855988 +oid sha256:5363be8eedfda57fe31350a276d22202d254253bef48b6f78f6839dd171d246e +size 846664 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 4cb8a59abd..9586e352e9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2a7297a60a90ebdd93889616be07cda49a0874a07663e480a67fef3faeb6bf06 -size 795920 +oid sha256:cc2936750c7b7daa5aa9c134c9fc8c7d11840437948bc4a74389b7e1c20b6b41 +size 772438 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index e85b3ea2b9..98339466f2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc5b87b3102d67477a7ab285ce220ab242149d3a83223a6a4e2deb43e8bad4c8 -size 821450 +oid sha256:1e225aac97dc4d9443dc5f7be92d8c131e71407fd71a7d1386cfad4ec0dbd590 +size 796882 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 976c8c7587..0924c38706 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5d87e7db044ff58563bb8358f600c741401767e8f1d26e6555e78fff5d4d5f04 -size 768970 +oid sha256:67fe0025dded5d4e39293772f1817c52f8a6a365f8a08e31002179306d1a6ce9 +size 743316 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 4bfd2681b3..04fa64802a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:186015fcaaa860e705402f0502fe138e0ff5aea362b07ce035eae98050eac633 -size 685292 +oid sha256:895757941598c40042292f0a69a4360e48bcfc6f8192a65a3f8799ccc0913575 +size 661760 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp index b19df08981..17737c9e29 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:18062d6b856e3d2c627ba9bf50cd23b335b548bb6b653c10478dcf70fbfae0c9 -size 797064 +oid sha256:9f907e46039e022afffbdf981de0f3743749f37438ad478f2e872f209af7bb07 +size 768994 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 94b8aeb143..2e3fda43b9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f72c3ba101efedd5e8122cf18e632f9892e4af77c3c37e7c523bf16bc7d24ae0 -size 710872 +oid sha256:e0af7b55e9c242d88a845d7944bddd031fc65dbc512931cbaf00c75cb32d61a2 +size 687092 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index f9ff600d25..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d4bb43875831dbfa557a52cd1ec0ac4d6797ab5b260cd25098f2309cc3c21088 -size 580407 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 070f3329f6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:585c83087bab666cbb291d2a56d77786d347032a596f61e496852d1bbcf03fbd -size 540741 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index c8db54e864..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bc8055eabf24450ece5866e02940e285b687ee2db073347791a74e0e297e062b -size 566341 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 281a070afc..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:86135e5b86c48927ccc95ed63a218b5751e78981add0fdac760b87b13d84738a -size 532743 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index f50361d4f7..0933a18202 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc7bc85866567580bfbe5661c3c949f8eb1f64281a2f8be7c033d0cd1685f423 -size 777984 +oid sha256:dfe3bde6906f53a7a4e6069fcefb949b410e6edbdab87073564c721ca37e181e +size 748334 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 6293b5bf07..25958a53be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1def71631aeaaab5eb315c8267e308c9529b8902741126a4d283807dfa82fe3d -size 691494 +oid sha256:03beb39c781e4bd9592750b83235aa4c89907ff35b2ffca377b0968acedaee42 +size 666532 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 24fcbcdba0..0a406922f1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:281fa59cb08abe2f6a396001e356653b08f162fb8418d784d3da26e8b2c1ce1c -size 799270 +oid sha256:520981f9390331adfb7041c8804b881282a72e5daac5dcd4a3c7f8d925e0e408 +size 770854 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index f2ad401d9e..77e767d582 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70d0e4da06f993ef6b032734e95637d3bacc7e1f5283cf56f251a053ffbea66e -size 714508 +oid sha256:eb29135ed12d4c1c7dda362a963ed248b7701c77e9cfd215dab7ade9c4771d74 +size 687522 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index cf835a7fca..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2a94280a419732d0938bbc06b9f6f84c3c3dd5d3a538cd4cf17674bd61ebe24a -size 588971 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 78e2bcc859..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c5f01d48558b09037bccc36a4fdfffd2e401d021388351bfb0c396b8aeea6b58 -size 485363 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index 3d67b760a0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b5dfceae3472b5ebfccc515c384684499b444c964a4c7c20ef5765935dd1cff7 -size 558087 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index f8dce5ddec..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4aaa32fe7256d95599d6455e571fc0a3b57a1b1aa2e42508307b4b29a27b72a7 -size 458327 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 40f6979186..8665c6b615 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0cc70a261155bc902f7feac22d4e8d81da323fe2ea03cf43efdcbc963151d425 -size 885992 +oid sha256:f3921864126c24dbf211d0a65ebd1d60f6fa02c1c29bf310d244fa48b89bccff +size 877012 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b35651095a..4f30051305 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1df96c384a43ecab1b369308767af6f856cac7412d0f6db8e6c1f7db48696ead -size 695562 +oid sha256:be803c6bb21c5cf6cb2d48bf02535037737cc976a8094d6b8ce29d4184e276c1 +size 685548 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 71f63057b8..0fb3f8d7e6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:afbe618a04c91bfe2d9adb4323392a9067d48b5c335409c9f8bbda237c07f1f6 -size 771092 +oid sha256:3184fa6557cc10e2c1716546cb0a0c08a2d9104a03bd63b448fde694063aa6d6 +size 760338 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 6901051390..e3cfa9750e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c11d6b58701df93099d6bdd398d6150c823d48ad2518ab67ecf8bd56af657fde -size 778986 +oid sha256:5e062a0bd3fd366473218585733f5e61e6651c644483c47369b62ba1972de982 +size 770698 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 28e474fa4a..dd97d2b8ef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2cb500230f57bc9cf07b577a7a41f48bf1e4aa101d49dd372efa3e5b2322bb43 -size 647608 +oid sha256:0832fc7f489d6dff726db2d05f8ab2790c0b7501c89961308172d9be858a5020 +size 639124 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0d47fc8f5f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:694e13131c0869947f0052edb747a089dc1b759f8482aa6cceba4e2c0f423211 +size 901802 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index c7d8111f9a..4890878018 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9fd346fa382372d85270c411eb5a3b1be4cf52c21ca095cc261a73e7eb12b6d0 -size 605849 +oid sha256:7ac54359823eb04e00d99df41a60f593f12d1d2be0b097691e59a11abdd0d97d +size 609549 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ee99b6f862 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31003fcb421b83e26e37496d5d0ab4a0e134151af3e460537ba53d96cdf53c64 +size 673830 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..da97e20bb3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af991221d97455dbf170df4e8ab508ab6d207c489363b9e1fe31eb4372fc1983 +size 793218 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 
6dc2a45a61..2d53a86d3f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1b59b7de94ddfdb77465000848ec1b104808d8ca9d90d94ab476d808946b63f9 -size 566183 +oid sha256:ef9aef0b633c4e5637e38f671dac3aeef67a439398b1b4ba7532853c9a1f2471 +size 568007 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 2ddb8ed4b9..14cfc50a2e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a3fd5fb3ad8617c194cd9230d93056a5f29d13feb61fcec350c62896d264fd31 -size 856188 +oid sha256:d3c5c7e69971a8ec6f95e308d0c81dfe11d3bf2c2506403788118425364d7917 +size 828314 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9e735651a3..c31fef2a03 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e26ab30b60182c6b83b7e3d863c7a2a87a1d026f930f526d57582f16f117ad9 -size 684506 +oid sha256:91322af728ce6d85470421af4b716720546f2b0f0127d6bc88c9ff28b9c9357c +size 671778 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 4f89ecf594..55a25ea196 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:321e60596a00922d596e46c3dba7ce24d5e771e4125306618a00a063764b4e8f -size 748540 +oid sha256:82c150420501ed76a3a09e7b5f8c313733c68c8f4bfa07e2fb39b671304c2fd0 +size 735862 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index e442fc0d3a..2d0863c220 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cd14f5f2c9e1202eac49753aa752d83ad4ccc5a241aaa243a6195c9039a512d3 -size 720224 +oid sha256:bdc808389ee2d822a3cf6e0f275c0cbe7e6262730778eacc0044146aa2f786cb +size 709864 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 818febd73b..f496f13226 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8927ae5347087ab5216585957aa430f77718a2d205e24242346f880451dc1035 -size 641682 +oid sha256:fd40d1ca7595c593bd50a81e90e5156854d55dabd5245dd1ec2c819c73e153b5 +size 631668 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f5c249080c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51901de3fd2adfda7c3ccf1962f6b6baa55a5efedb51638cb8fcb44c7d2a4707 +size 853054 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4a4bd11325..1165dec3ff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa7689e8cb1283a2ee5861eb601c651f5daab5f93e6a8c9f37160662be8db67c -size 591783 +oid sha256:9258efdc3e535d3a04e3ceeaf7f9dbe74c274bd60955c335585bab97a83eb82f +size 592671 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..177e4b526c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de383deb9081258fac1a6e3223f748c365977b8789595a11d8dde6588c912186 +size 649306 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ba6f185302 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2d0808ee7aee2a16382d64199eaf7221b1aac2301b2117894b7f35025bdb974 +size 732136 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 32f3147130..9e53205406 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c2f7fe85b059873608e4d4ba6d6e0c7c32fda0a711d1962b71f8d8fa5e9b822 -size 557395 +oid sha256:d544c78d0c5e03ccaa9004a464b9efada7ec051f64386056ed2f5b37538e2a89 +size 560109 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 12b065e8c5..7bbb91ab64 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:af1b2b82fb6cbbccd6e410d2f833627b2e76e288573e6d252bb41b7f646d3bab -size 848318 +oid sha256:f2a14f92e514e87da32a5b1c8032bfcc837996fcc117ad83e7c8379e72404ba4 +size 819952 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index cc8e1305c8..6541c913cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5584549ca3e29650c6805464c04200759fb18949ca09e0a210bf4c81c1ba71ae -size 836144 +oid sha256:aeb71aaac16870f67f2bf78750dd676bc667fb8f35a62b1998e03e5d8885a153 +size 806446 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index e9dd9bc42a..64e786aba5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1337cba5f621295964cb8312b3897a413e881e5eabda7c24fdc2bf98a39c38ca -size 758720 +oid sha256:fc1090b2b9dc91deb4937c31f2442b787c3fbe30a50ffeb7ccf21baca602ccfd +size 737804 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 10814a3db4..9586978178 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac65d23c2a4bac4c799ee919c22570b4013894b431f036f37878ffba8358b98e -size 751036 +oid sha256:122a5a5040c16c5971fc7d0a69d946cf4d09e06378783809d339f8d4ded15399 +size 727504 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4af33318d0..8a1ccbbe6b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:62f7933dc2abfbfdbe58439eaccc01627d12f35aa6507f6224b1204c7cd9329d -size 708270 +oid sha256:88df917c0544f52f9414ad50f16d9dbda7f789a1a3170604919964059e1ef07e +size 694606 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index da19c1308e..99877e234d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b8a6573f1045c8d0bbf4202ac1648a6c0ee9d084f99c378fe4d216733278aaf -size 605797 +oid sha256:438fffe55d4aac82aba89555ac25effb724ed62d6fcbca3000b1039bd0143ad8 +size 597705 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 583e71e46c..36f6e7b5d1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:787dc0c3213d683db50a67f428fd589bfc15bb1e74c058e3c40a62915004a702 -size 796182 +oid sha256:e46a671d45edcfdce918e0ee7e3ce607f8e8e2ae25200005d39ce839410e06a5 +size 781186 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 61a1dd4b95..92157b8e1e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d265fe14110426b3dae485a5c56f1af133b38c519b18d872b0f17ab4b5d5213 -size 693314 +oid sha256:c205a56818f34140b4700632700db3dee89052284d2b799d2dcd3a6c14c22671 +size 677330 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 3fc8789e7e..40c3ba5501 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:75a69678d918028f61de7cd3f1039e141ec0cd81239b7f5c8c43c2bf7a503123 -size 719074 +oid sha256:b56cac961a76da5235edea73de92231e0dbf8c9b982ee5d15aa4f526bdab93f2 +size 705902 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index e5fdba08b1..a0b2547c35 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e3c87dd1011fa320751b99434d4611fe9adbc8e9db1db17b51abdf5a860b78e5 -size 631450 +oid sha256:a8b3e4621c491a23a2504999e681d2f2786d1364d78a6e43be6f979ffd42d75d +size 623656 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 571b407084..7f0d0ea088 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ac60290a2ff1e0773f9c771652386d5f9d2a8507c563d59903c6137e1e60e7a -size 656222 +oid sha256:232ad4ffe03aedcd976f68074456e2683cd04bb3b45ece900abdafc86179c3bc +size 645468 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 394ce6cfca..df024d14fd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2577c27d9c8b9d39329f08022dc1187c640052dd4f610a2543fb8eb6164f2276 -size 560161 +oid sha256:e951ac29c3cec77bdb3302a3ec9f059e2133741bb375fb8b90ebcfb6dcdc7295 +size 554143 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 0961924588..b3965259c2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9829acf41e37219c2398911e4e9aaa5fcf4425bb1907f686e760ac051640c079 -size 873404 +oid sha256:9535ec8c7da8c99e3bf6ac1ec68334a1af93e465fa6aadd44ebec9df8ff9caad +size 841632 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..08d3c4591f --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa0d29bdd1bbb54fde9a21e4414eef7b84d214cbff807877d7df8bc4076d4912 +size 831876 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index d013e5b222..d3be9dbc84 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1f7e9b6d0f3b438a15af1709f8c345fd3c4d7bc377b1f298538001f3e0551eeb -size 783164 +oid sha256:8f241e7d58c4d574bf0e16ff4a254af98faf5745edd60a1df4c60844087d9edb +size 759880 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2408cb41e9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f70a3e6a181b77a37e888a7cfc465b7b6ae1644b6592d2d77e1935809dbca39f +size 753330 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f890ead489..34e8b70256 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d29793c08374b7eec32b100a5c3195213f11b1a100122a706c3bc0099b40c4b1 -size 613673 +oid sha256:ad866b92bdb3aacc0239545f12a0061f4cabe45b5ff5dd4344d9623567e12b5d +size 634196 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index b4a00ee9aa..59d23b5eed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f947730057395e9b88e809fe52c2536b620814d6f0005c7a88ff3494d7062dcb -size 
509077 +oid sha256:e9e19da60c6f632b50e76c89eb9b1340b9ea6b8c9ead1a03f111c8ffea526bcb +size 528613 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..391d0d682a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c091b2e3f2b580d885c6b761b4477cf013fd0c866a6180e6580c5886c90d0b2a +size 714560 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8fbf58e475 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e772b6f3ca3936408e501d568e1d19f363161c3e5800d3e9cb9a7a034f1c4ac0 +size 601233 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new 
file mode 100644 index 0000000000..55552ebba1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:016e6fd5997548830b1ab7a3546caf9767793c9c26f6276acbe6d048871d5c4f +size 729310 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..aab35fcef0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f282433dbbfdfbb1804f236e7cca834aec792acecbc2f79887b70227edb0711 +size 646274 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index abef82ba05..af0f264b13 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:588359d8ad5ea1341a713dea30de68bcc04b65f2a1781b5c4d8c16c56565535c -size 581555 +oid sha256:4fc2ebe5e649133320e74070b8f26fa20c6ddb61cec160b4aeaf11b22ca2fbc9 +size 591667 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 11aa6990e4..84f0005c24 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3eb90c84d2fc755cff911570766be7e8a4ff0b5b123198da0afb3b3308b335a4 -size 480463 +oid sha256:4c73527c70d622465b9f2137714e5c8612e4f370eafac63091f5eee0926151df +size 490625 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 8e385b3f04..4d2799391e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2f8fba72638025f4c32f43aea43a395db6ece5e2607feadcffb4abd3437d703 -size 668316 +oid 
sha256:8c5a6be48f7c5f3d23b02839347e2bcf2ad04a3697a7ec164bf9456ffba40879 +size 655094 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 1850cb4997..9f3d44d3f6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a732c4ae537de8f5dd1fd2bd16f42c28df9232aa9e419df2e71fcc1715fb1a92 -size 575807 +oid sha256:5f97064cad7bc36a5be349b428dee234d72284d27fb05a2810322bd5fac0df87 +size 570381 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index df90a052d8..a58fbc5aa8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be2f41614dd2773da31dcb4f145e9015a09b5b5a0e640089d97724684143549b -size 683732 +oid sha256:85c43ea8cf3d2ed8430d122738993d0bce1015af07ac7bfb67bf221e5cbd39cc +size 673914 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 2548b60cef..ae715fa4b1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:62fff0a2319d6a5e445190087cd2cbb1dc0a108763a69d06800c2f1bd6832777 -size 593789 +oid sha256:6bc247f9309889ec130aa289f9f06e5c3c0fd88680bd97050aba8397044f5ddc +size 586783 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 04f591aea5..a982d57834 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:82b799ef0144769939420451a9a5f3e818bb5f4428a82d8d085c156f6ba2d8a3 -size 664762 +oid sha256:3a1a9f62cdbcec8c944f1b2472a0602b62357562c177c8cbf98d294af788996a +size 655832 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 3ceb913dfb..f4dd0cb3a5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:584a8502a39599dba6801b078a0c53de6fa1ff065363e6938ad8a2e1e46ae45b -size 579259 +oid sha256:6f0a4425f1034392d7507c5f45cd33197601215829af9e397dfe0ef905dab634 +size 570823 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index fc7bbc6440..8ac72667d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:685601bdb9d70c5c9bba0fbb84a826fd0c788187a3e937c312e410c8bf02f15b -size 682102 +oid sha256:647ed7612017b9199a4f3f3888a740d375ae64521c6eab0ebf116be21030a092 +size 673172 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 4365adc0b6..7dfb0db926 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd5ca5e1ece8e74727aa8f5b05fb348b93853e5dc10a97ad447b0caddb34943f -size 596451 +oid sha256:ca68d9b26d783f72dc9a694cd2888b12567b1203c07f436b7b2634d98a3d0115 +size 588015 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index a6cf8ddc9f..3c46975e27 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:758a26d01facd2438d46b45308bf50ea10382f7af3d9fa58f1bec6453d8bb147 -size 733024 +oid sha256:c0a4e0dba1117e68d1a470cb111e1d5756b6f720a5e2d8c371e917c36283e502 +size 724094 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index f9d120c311..da94e8da1a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2bf4861d03e14b4983d57fc281fb5f8ee723148e0caeffe541702d1a3427fb37 -size 645646 +oid sha256:1efe3ff1e4af85da4c81d597d528e56521e6b78b75806bc9a0c694ec4e4534c8 +size 639530 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 27a74f254d..76fa7d1035 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:514461374543fe058e45f15f289d806982097c57df13ea18735f5bca2b592e9a -size 751154 +oid sha256:03a02c5caa3aa4e66ec74554bd12ef784d0ff8da3843e4ef625d153cbf56cb7c +size 742274 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 14ec7cef22..6d298c1c2c 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:87e840d464e3279fac72532adf25be74da4c4fc2abc01474ccb52158ce95c482 -size 663776 +oid sha256:ca0f86d8b23b5fa8dfca8238442db7b6d436773a65b16d31a2d92b036ef6eb8d +size 656870 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 86e395808c..693c8b8ea0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a937481a380e5b85f66111740b009caee07484f7fdc05d296d342b2e7540669f -size 800908 +oid sha256:7a027d93dec816e26f6b42654e4ddc8eed5177f82bdd4bab6e1124e977119b6f +size 792028 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 340927754f..0febfa67cb 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a51ed2b4c4bf62ea7d1e1f71838217685d348976dca2d5bec3d5a83cc4d8200f -size 768002 +oid sha256:c1de4eee7262587d33bb181c334e6645c5beb3da4de8a847fdb6cda47a39cf43 +size 756704 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 0fa20cb4f5..634ea1f1d3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:99fd6460918c73e38880df39ddca7187cb3df83afbada7d544ba5533cf125602 -size 854582 +oid sha256:91c7b8d69d4e49fc7f90e02052f15ecaf8b6cdb7bae7dd2baaf8a9955d9472bc +size 841706 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index d1adb6a223..f8e0729fb1 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:517a777078587ed1fa656e504cf2230d97ad269443ca26dfabf3425582390c74 -size 732778 +oid sha256:0f56a3fc77cf6e29ee4d6e6eadb96d4e488f37a90eb2192f9e5c88af7c3983ca +size 722862 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9652a890b8..23fe782305 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8150d1ddf474e56256f80a34b06763560f38c1f00bef994f1ea93d58a7055b27 -size 726016 +oid sha256:50fda8f8eae4a808e68be4aa042523e9b2bb3be527cd8d39027501c161484ddd +size 714424 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..66438e744f --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9dac3629d15a0c0f29265a5555e0501de622b08f099d1c5803479046b2d0d70 +size 815634 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ade0498ed6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dad757c9127589ba0b95764a73af6d65619f14c387f85a298a9dedbed22416e1 +size 638674 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a5165f13ca --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cca60891e6e09ac306622fb712e41c2140b7d506c54d71d4b19816f7cf2029a0 +size 706458 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..273c819f8b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df81b1e3161359b9ba5d4a87fdf3a4c8bcf5f183b0121a26840e671f5516015f +size 745134 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3748a05e0e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6123300d9453e535af25938dfc7ed95fe9b07804d7af225937e1288aff9aa710 +size 601967 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index c123d87333..ff3f1998a7 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a0737d1d7547ac3c9fd5879f8812834a716340d079154b8983a7e1cb65462e6 -size 719010 +oid sha256:a2945788854f95a61baedaab1574fdabc3887180abc8cf9ca26ff33878b06581 +size 703370 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4abb185d2b..0dcff28113 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:21f39d1f8df2d6ba54f6fffa1ac80c9b6a478caac3ccf6f0de3c39cb661dc00c -size 751222 +oid sha256:716857ec9dcf73e485804fece01813826ed695c274ceecfbaa40ec34ff90b84e +size 737260 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 6c064ad0ab..0180c3b409 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ec2035216174c976b15d953080462ff30113d0e051c2946a99e03e5fe2df69a -size 822262 +oid sha256:02be92223c8f6b5e27ba0d365ca35a15e18808521c7b7c8cff4d519d98b378d3 +size 808300 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index fdea480493..dd75370526 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:234347310f16abbf39476ee4b351959da6eea7b090c0b816f0f212f991c7a041 -size 666024 +oid sha256:56068400e0aa0d655215ba3c28e0ad4e9a1c6e2b536da09ed7a72a3218fa4c05 +size 651076 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f7480da27f..6264c84150 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab07805c7558b1577192f82eadb667843fb24090119f1f875b351114e68fbaa0 -size 715700 +oid sha256:a32e3084842a7e2bd8f8bb57985e9c86ed6278dfaca10128acf5d6586d26b883 +size 701246 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ad78463798 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:379ed34b8916476df8920985a7caacb3b6e80ef67b5fed454371f1438b5a37dd +size 729788 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..67a33cbcb8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7e08417ed772e146d0b96c9fbe28242c92fcd510f3c39cf31ce01d9a7fee5672 +size 618588 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4c1d398bd4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e127ec41e6302113f5435c32b5d85b225727aa8c7e91f527f3fb5aaa2a3e22d4 +size 675470 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8b7857546a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3f3e0f4e0f115d3b884e48b3df8892bd29ddf60114f670d5199f0dc294e2c9d +size 671918 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ea186ecf73 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5ba28aa4560a9ec113a4ecf792617b4e449c6918bc0be10096811f72781f858 +size 588147 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 49ca673b72..c34de78017 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a821431574468fafa33c0621a128e711f585dd950155bd3eee77170807f6f550 -size 755836 +oid sha256:c3f9ae21dc47917822120e5352bf5e663641d1b4cfdf172ffbe9f6000564536f +size 743600 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 70d008cd4f..a993d41554 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:22f3ada162017b5e15bb210937d9372ea69de9ea8fd64ca3398395ca768c10fe -size 721068 +oid sha256:426faa582ee5c52f4b8f46e5f48a2a6eca595de8fd003abaabdb0283d9163bf4 +size 710708 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index e51e53d31d..6feb382bab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:30c0c5926380d9194b714da975c217910cd7365e1b90cc9ef52f8f51bfc7de79 -size 665300 +oid sha256:2993e0a209ce4096d4f1b136bf67fc48176e5459a1dd408d9e4d54c39eb2711a +size 658492 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index fba56d5451..c5a0169ca3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a5c3756a7a1588af9d25d44d0cde2ed8e42d3e768251c0212fe035200d74136 -size 633000 +oid sha256:a7dfbcfc47148f8c180765ac53ab33a238895c7290622153cd188749096f8ba8 +size 625402 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 26e059391a..734c3c4e3e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57124335d9a22e7d5b04739c7527b6f21382e140f35158d9552d9e080bbd09f3 -size 778194 +oid sha256:9f3b900155abff80d9726fd2649d2b3f9094521a986ef48a9820740203105908 +size 765022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b18dd2731d..7998706232 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f92212af4a75881e0c45ef8b337251b432ac7771c3340ccc5d4e7cb027fd575 -size 673550 +oid sha256:de49e599e1db44064e1077bdbf29dc8d29629fbdb26e1fecc5530826224cc881 +size 664768 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 878f819512..a3847510a4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab016c3c76206f454076f7a7288fb058fa7d84316a0bf00922a84d12702684d7 -size 872716 +oid sha256:5416ee69f2342df8b871367ee07408c2015265130a5223fbade7648975ceabdb +size 857176 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 90e5e542a8..d4a4e01a7d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c3fdeee43a26366d7d956b6f899875dc5ebbd33abbd3bf6112aaa914ad40689 -size 766888 +oid sha256:9d000c8ddd524f55bac92a8cd4fae00bfd8c429668a894a0dac5b320940e2177 +size 757416 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 0b70e0ea89..6c06d472f3 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09a469c0127a022c7c1104031756bab51daeca9ae492d193f3c2e3208bfc46e6 -size 670400 +oid sha256:26e993c8a17017a8996f680da04eaab10284aae9e144b80f9a4689caed4e501d +size 659398 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index cbd8f5ffe2..50ddab6cfa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:35449066b1418cc3d879f97b52566e04f7d45266febe0c795835d509267f016f -size 578829 +oid sha256:2f1d09e7694281351e55a3a537107cdd97331c616f4a1d4f127e7e5dc86c6fb0 +size 571429 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 10c95cacb8..87f69cef39 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e7ace20aa585623ed5da24bae7526628925011be9e4154a2dd21733ef5abd8f7 -size 714996 +oid sha256:52f09154aaa6516a6ba11f9041c4f8601a3cb6fb4b2999213efb51c2936125ec +size 714748 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e670d03ebb..e89c935d0d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f2f1a8d78ae4de5e420703367e1e23355194118030f8328ccdef5fb9a42766b4 -size 625990 +oid sha256:e31abd35ed66175d3e323681845e9ead1b339ed076ceaa25df369fcc41f73493 +size 620564 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index d0791b081b..28b42a372a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:9c120d8aeaf85d2e19f42b8308d51c533321305b41dbad41aaaafb6e2b1e66b4 -size 772386 +oid sha256:7b6adf52ac3e71ebfa17401dd7e5e1fe90487ee0409a9a3c1f1f51598647e849 +size 759856 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b77e74bad0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:418f422b7ee261315a49397db395ba77823276fbb1bbd51ab00b5f03b0771960 +size 735300 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 0226c1d4af..17710678c7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f2a1d22728490e41ebecaf7976246840f34c18f008249515f3c32fbfeabeb942 -size 683134 +oid sha256:921b1b90ca250ed1077a1c7adf84f2089573aac23b6fd5df0a4155f4a4942776 +size 674748 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c4e4398981 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6824590a5fb7ba79efe7a68cf04b29e690aceb0d28cdf9eb799567ccca2e9c6c +size 649304 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1508f135c1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14fc43616b589a32422f6dddc7b4a18e5db285a80de2d2f7748f9d0e74797255 +size 658436 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..946deef8a9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:83b2b13500d5e90d875ce85ea2bdf7897c3164c0d9f8920bb13f24fd67584de9 +size 551917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6a0a034a74 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d73d3c13f80ed8f671ff955a54a1c3d1c70a719fe00e197d27f8f8efb7c5ed5 +size 733916 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9aa36571b1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52b6b2f92912d5c4003f159ff8efb0e15a99a0d54915cb6e8c24faadfa0a0e7b +size 627398 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d484682916 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:596565c184e693cd376149f155d1d6299b786b1792786a05e858f29139b72069 +size 683990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9d11cab892 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63aba80722c6ed6f8c08b830cd4bbe3d1e573eaf941daccea871deae7eabeaa1 +size 592863 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c225125fee --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6130d1446af2eee030dfb8032a72bcbdf7a6746064830f1c7640465d6f0f3c77 +size 607225 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..53856fad48 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b27bf2c39bf479e7df5443a7fe6b209a0c0b0de0b1157bfe9145b6728bd73ded +size 513187 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 90b8952ca2..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:405031492673ebaee5c1f1160eb55173c57590cc57b4a0c2be1071ad3dcce9f4 -size 638622 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index b4fabe4d51..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:b836b4e7a3db1c14f05b4418dc793293a41b6a0a172cb7b869576d6188a05284 -size 603099 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 2bf3db98e6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1024eaf1d58d7f6c0b867a1d0bee112061c0849b7a7d84ccae4ea96d12f7478f -size 621102 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 008710459f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:45fcc374245596dc469e1397a478b7968210fa61e75d23d5637da15cace3eb74 -size 591203 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index a3fc5e2368..748c286949 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:84e16c79ac57b746fa56faa1fe96317ff4a28a50a3156df40e6f3db0ed3f2847 -size 755142 +oid sha256:7f2ac2dd4157d52fbf5220215d6beeb5d85accd8e4560d90b68cf92e6fb6b654 +size 742810 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 82db06f764..77a5b76597 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27bf22a627724648afef9ece579f64e64d956d594f927d0ba5485a13cd6c2768 -size 669442 +oid sha256:8f708511ad72e470ff584bfd80b4cca910484dbd14fd1cbad78a79b102f745c7 +size 657898 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 65946ddcfe..43b71661b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:becbc4dc8a0dbcf20d0f7005209bc7d9170e696c20ec81e118a5143a220a13c1 -size 776330 +oid sha256:1c9b9b63fd6c8dad30f038567cca9bde8fc1c353d980d41f9342de58ffa39a65 +size 760840 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 26bc6668fc..388479f81c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b734d82dcd74cc25cb53cb94aafb50720e8ca2617d31934573074d9495e7439 -size 687424 +oid sha256:3e71633fe3208c8e016eff32a022ad2c255b96a3e9e77bcc9a416d52755e922f +size 675978 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index eef63b77c9..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:eb60dc0aa9c8811c8df3a5e172845d226d8758d588893215357a381e78c471a6 -size 644374 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 5aab60de23..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7f7aacd08402cef167e133f1802049501b7facbce3b5832d1aace1717d1d0fea -size 540913 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index 31e8439823..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b1bdba1d6054c491f6a40cbd9deb662a3bcbbafc8748f10c3c49f20c6e2dcc4d -size 608653 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 7a23cbd079..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a5364ed5382bd791cb275849b56f78783b2855940d831ff9cc51e14ed722126e -size 508795 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 35956ce4b2..1cf60453fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea4ec7e7f2cfbb50dc3e6a6516e6e0bc7a871159ee92f88763bb75c107d47342 -size 852740 +oid sha256:9d8679bc16fd09f41271f2eec1d9a4fb2a45ca2c5fc67f5d04531f87b79a89ee +size 841690 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 47c708b95f..c80c8e8894 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:af6cbd631e0db4d07dc3a932a2e699ee7cd8a3f465773de0f90e46b3e6a5561c -size 797386 +oid sha256:3bad178038ab85763f639a7c940ceec965a76a2e57834c6e493aa43b8b33a46c +size 785646 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 0961012b50..ce3198b5ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4df8f6d3b4a1ffe99fda79c64d5e4e02afd28c1edb2422e85438a1e8936f41e8 -size 887322 +oid sha256:a437ae6420bfd9a274cc98f117932fb94c20af5911a39ea160c80c914fb521a1 +size 875382 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 8af081e7ad..bee042ff7a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e45064d6d4602f8b068fe6cce476977526e02ba85fcdff8fd6fb60dad45624cb -size 770846 +oid sha256:12884af1329b49c0e77c22b97f41c0513066ea974604ff2c09305473d22b7de3 +size 758660 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 07d7116d93..74b5c154e7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e3402129925cb24d9d4b9bc0a4bcf0311917cfcee1d53a1effe54644c91a65eb -size 752196 +oid sha256:9812ecf1c192b7d50d9e83b3402bf497ccdf9cdfceaf4e7ed80f734ea54558e6 +size 741194 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new 
file mode 100644 index 0000000000..3d190a2cea --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:770588da49753404999fe792539d08a4e96fb00b3d0b42df8de82c137ea1e73c +size 865492 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9e40f26abe..81a9d434bd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc01602f21d64866b6b017d72077a6157b8cb87eaf1e2e60b59aef99f98defe8 -size 664112 +oid sha256:49cff440973d5f7c1fc10bff8a8dd7d96406370371a26e8e1f9b072359cc296d +size 667270 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..dec56db300 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f7f76fb9b56662f2477389ef74bbba1aa59fb318ed125868bde1f67b0f02203 +size 739888 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d6d6471257 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:115cbd210af598fc0b276224c20794348ad262db5f23e3b67db9d2e62cf05135 +size 780934 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index e7a87ec3a4..e363ffdf10 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:eacf25938b5cfd923c584f1bd519d74063a10712db0dd014eb4b79f5928a2942 -size 628540 +oid sha256:51d720c1a016aa0eacb8982b5921f3f704a87eede5f94a7bdb351b732efba409 +size 628492 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 6726df40ff..1f074ef617 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36ebfdeae1fb5a714ab0a6479bdaf73a7cf62e2b59f99c9b2bd8f66bcfb538b5 -size 769460 +oid sha256:f5fccbae03c916c6ea8ae77622f60dad104c3d8f7df3e2105b4a3dbe3ee1c6d7 +size 753822 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0df8d147be..5562bcc9a4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:3bbd0c70668df940a4c7660841eb16a6f4ecc7ac3fdd8da1162fae129833540d -size 779818 +oid sha256:c1d6fb873b8722e2379164231b3661be926b217d99466246923bd05a64fb7799 +size 766202 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 046a0bb788..39be126c56 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8053f49ba2e31781d9f3f32efdae8b2f9f1eaa3730c5f52379fd3858ad861b0a -size 855594 +oid sha256:7bf349ee85d88c6e7c3c8aa79c1e565bf3ef9d0755f150916141580360101127 +size 840202 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index a8b887787c..dc11ba59ae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:a60c9876e0a477305d7a30bf9615637046111e98f43d24a0f97bc8c59bc0b3a8 -size 702956 +oid sha256:87e20de80fbf3e860dc5679d574cb37da5f0b84cde80019afb85a271424643ee +size 687072 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index bd96411325..6a639fe5aa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28a3a523f6587e85d2c56fa8eef34aa752f48eb86732025652a43c1c0d9bde07 -size 741928 +oid sha256:493cc58b570a841f36949258bcb6f9b5dbb468b5100b96c5f816899f9220d45f +size 728016 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..181981531a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b51b875837a0cc287e6763f82520bceedcf4972912622875d9107f96dc11e35 +size 779646 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 80346a26c1..8dd7aea09a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a40edd84ee85b46b142b60fcb19c68cf2613bd9f4b0448a70d28686490890758 -size 645804 +oid sha256:795be936cf098413b164f6774287537399cc3c58183f51b9050ee84fb0e0fc15 +size 647234 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..69af1e1eb0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5914ce44f70acccd50d5175e3897c40a6af0aab911192129e02d62b23b26ed7b +size 708112 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..cd79280357 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c561ee6c64073c406597bbac8b2ea93454ac6df2604ef41015de6c274f92880 +size 708604 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 63561e1c77..dfe73ff479 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26796bd4b1d9783026a0cd5ba04dfb92547fa257d1bd406ee09de70d2f12dfb2 -size 616891 +oid sha256:1f074695dc0f5a5f38df2cc27e31a5ade56e321b67b5cccc0fb272392c86c470 +size 613931 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 7d90bb6f8b..3642043051 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f87a862ca3cb3d40a75b918f0bf1e054a690b6f39caf9423ce8037362335a23 -size 823108 +oid sha256:8d0531335df7774fe1c61ca1cb3336a45e8d61b2f9b91d3eb5c53f92fbb6655f +size 809788 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index a8cefb1c58..74c30abb3c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fdba0c022ebf05f355f68ba1e0fa6bc58ff73247dbfd16a87e1dece05a29080e -size 769150 +oid sha256:1786487f78575f7558724c98c593217cc05f46d5ebb076d20093cabbfdff0c21 +size 758592 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index e5f599bf07..45fb4e3bc3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8821bc71cdbb8c0bf027380469955c800c3330656aeb528a899c558b318b0af2 -size 736076 +oid sha256:1fd4ad54dc4518a9eaaa4f484a16df1b07ec08639a90d794564660166f9e783a +size 728874 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 19b7cc4dd9..633a3e182b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb72665885359dd4a6049f209ecb3921d70ea13131b4cd89a2dcf99e2eb08188 -size 682562 +oid sha256:73dfde1e74a24d7730bd42478a91466b6579edfb541762763f174c83acf31b97 +size 674964 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 67c3b71fea..8efd7f2ead 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9751d79ac74df3491c8b2761e3f3e22d88b1adf008bfb09b5180625fa189e3fb -size 806740 +oid sha256:9cb0c2d63a7119bc85f1a1ac2e8337c8fb0a1a5ef4eef2c234a1c4b74734c7cb +size 792532 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 121602b208..fe1861732e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9baed122b03af0643d061d9825ed5c285819fab2377978f1f8fc947e5e26cc4 -size 700862 +oid sha256:d75246d2f1eb7a380e8500dc36aa62a3f45ca55147fa102a08628e5a146b19b6 +size 691194 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 82cc81c787..3d442a8291 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d16bc9cd4a7033147dce5c7a3a8e4ba8593dc150039fb498ae24cccf1e240e4 -size 907084 +oid sha256:d486d361189f30b54c458869d61d33794cefdf6bfbdce10d4b04b848201e26ec +size 887104 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 77e2e538b2..fbdac31543 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc41d92f971d8c0b0477b49161db7813d44296baab728fed610694ae14ef021b -size 800466 +oid sha256:fb7da762ebcbacb66e7252adcf20495a2ae2dc8f27b5e908d2693751e7eba9a1 +size 789366 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 5ef327a023..9d83b11fbc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:559912acf9b01a568f787524297e7abccde1dd421ca5ba57e3c8fedb52359e26 -size 704522 +oid sha256:053c96f1926b891ad97c9deca2ff0f7e809915ea2c83c585d3adeb4dd07480af +size 692188 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index f6bd2153e9..45133c1473 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e158408b6056beaa3197d7eb1ccee77183411378c6766721b07b5e848c9dfaac -size 615417 +oid sha256:4e49ab5351f3cd5287455df09f4ff68c598cf6ae4b0a913caa86f509f6e9d015 +size 607573 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index b05d02f16d..ecefe9c068 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40e15c2bcfc7de0aecbce63c737fdbb7612636b0f4e3c2167ae77b7cdde17edc -size 751238 +oid sha256:846504e1dbee6c09566be59770f303567be9dd1115c2032b2f6d5c5c79f22cd0 +size 739940 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b96c9fdf0d..3572f927df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3caec4ecd674b7af26d59b5c4668c1b10085ed8e1b5137ded75535453eb80d4c -size 650788 +oid sha256:1e6b821782f14ccabb94350d56764ee7523f147fd046017cd3704d8ba4304159 +size 644028 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index adb6047310..1530573395 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7973b29cbdd438ca2ef0cdcf61835eb6885ba5407358628bd8590fe03fedb80 -size 843212 +oid sha256:2404e154a1c0ccc7cb674c4d289a066a3d1b6809fd3a89e55c09321c5c208528 +size 827820 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..77bd58e779 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ba1a6b0397320323968c4f88b4adf263e91b770b6db8fa8788a910d9da47e2b +size 781804 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index ee3025f754..c6abdaa553 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c0503daf2a9092d14aa9820ce1a7eabaa989ce5d73160dd502b2b377f277eac5 -size 754354 +oid sha256:4d24ea56e54fa55f11adf778a6bdfd24242d4a32b554657cdeda68f66cf8948a +size 746164 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b2d176f23e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0795b181663feb21cdb00c9f31cb011c0959d2472178a629fe707da5808da3b9 +size 698768 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 5750a67eee..828ee72e48 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:03e08d1f294ef9d5cf83630a80062a633146f5431724349d50018bea49aee3b7 -size 668680 +oid sha256:c81ff6f71ccb6f138ae3e2562cced6736f574dccfb108722626279885d4d9fb1 +size 687920 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index a48878cfbd..4505957e1b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ba676ae5c8cbf0bdc6d2e4d464497c7ffd0f3cf3f857a9614789a2b243b155f -size 561667 +oid sha256:029257f5d1571bec4797b9ceb33dc93ec50f7dc0a6b3ad4b89b80e7e029ac645 +size 581301 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4047d2b989 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff48ab624c711c8b104c12a6f3e48ab456de5c75f352d147e2cc4ccd7c79a12f +size 768728 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..db71f7b4d1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0456e6028a548fee1ee4ba4c188d5fa3d1b50eca9e43ffc7147874eb7902226 +size 661716 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..153045be84 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd92866b0dc32ce6535a52f49e6656ccecfad2049c569c927abd1222c510d86c +size 715794 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..80dd04a1a7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23ae2366b8a523ce178cb24d835056d70db47ec48565bd92e52d636ba131f8fd +size 629008 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 283cf00485..1486a98231 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05799a91169500153c8e9d1ca36869d0b7f21acbc679635eddea3c7280f3d1e4 -size 631776 +oid sha256:b9ec7efe97f84e6df8587ab89e1a308c8f31892ff5b4d0bedac3ca4f19ee3248 +size 641988 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index f08e635b6e..3601b15bcf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c521d8a3066f6b728988d8cd23a8d449d13e3f53489fe1ef0530bdccfb034b0 -size 528759 +oid sha256:9673fa198616c9cb46e8d993350432d9f42f3aae8d8df489ccca511e39c0f26c +size 539515 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 17ef7a3776..9bf515f28b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7dbd1e5b2bc47c543da40a87b27440efa8d4da3aff71578b64f67ac3fa1b82e2 -size 656968 +oid sha256:d3ad4205b0804a27b8468b454d026f4129614bc9487eef1cf6e7fd69678e4cf8 +size 648532 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 
a75f866c2a..ca9b3528f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92a366fb7ca8a64d1d0cd0df52ccbc611191a3394fb30a9820996a7707baf4a4 -size 574127 +oid sha256:0ef2c8798c1c2cbdf661a0f62e3230dc8934275a9429de8bbcdcc288b99bbd88 +size 567467 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 1b0fcc82ec..e58f180b36 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:31d34d2ea418d8138b311658c518365351e80eeb5dcdfe6155a7739e50c17958 -size 683582 +oid sha256:93290686521704e63ae658292e1761564947e9dd20702d384089e95c1dce6bf2 +size 672828 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 4283623d94..08c3eaccf7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:430db04b16ed38f81720f309ca2505e5f1c34e87dd44a23bdb829e373bce745d -size 598177 +oid sha256:1251b3dcbdc78b1fce18f3efeccd0e1cc12a0c8d2dd85ead99458a5fdb933e14 +size 590827 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 6ec5218e1b..e7b5510d0e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92291ea1c08de9cfa74472e7259d115630b01a44f98979b736d4784283579131 -size 660172 +oid sha256:2c9287d06b141a4d496d0501c42833747809e7d5764bb56ac8cd3f72ea1c16d1 +size 645076 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index e9c8325803..877b1d3e5d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:9970a553eca45f2b61d6100b3e2bf33057c844da28a811e713fdf81eb1dc349e -size 574717 +oid sha256:26b3f6a797db9eb399d010aedeaf215879bce62198e59223db1c6e7ca81f7987 +size 568057 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 76f02f494f..d3bfc45bc7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eecc32845babbe543e06953f62a89f1fec77b713dd1e0f33270615975fd9c5f2 -size 681360 +oid sha256:b195305c6310a4a073cbdad9e04c03d241dcf25e397b24e2ecf43ae4a604eb71 +size 671246 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index d056677feb..09e9c2adbb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06283071bea31cc3a2e5aa888c4d0c3f57d78510b353e3bf48798f33eb56faf8 -size 599309 +oid sha256:201d6758adb7264b4b7418d602209b1c26df265ba0bfa81ac03dce6bbeb9f2ab +size 591417 diff 
--git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 51d210860a..c25ad7fa79 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6b6dace3e4d94307fbec90d1b8b77a923a5b684112434e4bd848d1c7fd767d7 -size 726116 +oid sha256:91fb156dd03c4e2556579564fc6b431c536a3fb7eebd0c0998076d438c6d3a9a +size 717088 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index f29000ffed..ba4e99c983 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:069fd86cb937d403867568812bc9a0cdef2383cd7ddfff1f267867ad82c11c8c -size 643228 +oid sha256:bb28546bd87ca3372723e4ca1bf489e20cfae788f4c1e216a380643c06a78ba8 +size 636370 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 90c4ea9288..ed1d1ae945 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:68ecdef4cd3433db4d4cee3ba748bf8f729a8ad1a34ca8b6bcf0e22f68548eb5 -size 752582 +oid sha256:c6be3c483a5f4afbac788c8bec2ed67c9c9115802b45415d3b579721a156042c +size 741778 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index c8819ef822..e86cc463a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:88a787ce144bebfb13e0a4d4c3184ba70ec01c3ffa1dc9315543580670725998 -size 670236 +oid sha256:1843a16a1275ca2e796355decd03c218c69bdae9e6d2ae293fede06f7ad4ec7a +size 662244 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index f48de320b4..1fc53bf779 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2a0b0182c51af38249b701bc35936629838bc3c42f655b0c94c51d48fed27d6f -size 777770 +oid sha256:f379e769f8b66a756a54e867e7c751d8dff36b7ed6e93b9d3007c116c41b996a +size 763956 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b5d92cdc54..2664d07704 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3b7b07693cfa2992b17d260c481bb2b0827ec10246b6b3b1452badbcdbd939ce -size 645504 +oid sha256:d354d3f896e90789c2c422a703d2453ae8c7f34017204ee031aaeadc16509616 +size 635786 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 4d7c79dff5..b4cd96c567 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:67d5c3b7bff9000ef60080030085cc89ddc12e8bb6050b5fa91e20f46557e120 -size 710280 +oid sha256:cd40f4aa69248d91cfe913f8d29456a511dc14ce95b7319082bfd8112e0cc91c +size 697156 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index dbb31440c3..16048009a4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f0f412c0c15c656d48f54ef75f607e513901b655e944192fab6a32fa67e503f4 +oid sha256:5401d5483fc4046d27f6bb2aa59923a7fae95bb5d8b0ee345378e5ce24cb5666 size 702484 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index dc56ddcedb..596afe10d4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e02d69d1e0f4d8b3f0686d038930e620d71bd54f7c267fb2ddbc8675915f9da6 -size 619998 +oid sha256:7a09b99e34245ebe1b010121671d780e4bb671b1a6c4ec8a188be79465a06119 +size 609933 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6be68774ed --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d78e5512de6751e35142637ba4566228c7e0e5b77f73409adc1e8634971ebd80 +size 789978 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..333b6913db --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82c491b4a0e66f09b4ca7e0e0f22da47cd1075061e73428a91daac0f087513ec +size 568223 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3a3fc1117f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd37e49e79fa5ddc3f0218a84c39f840aef1b0454d6a96c5137bba7e863f38d8 +size 625400 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c2fe383ce5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:ec3f04926413599e6bc040d28b8f91c9c8cca5a9488c0c141432b9dfb82b7261 +size 726584 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1ac5630937 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd0358b4695dfba8ea255c06e3e3bcd9c4f218f7ebd2f8faa0e10864de89eaa8 +size 545379 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 541a9c02b4..a0a376d412 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48f9c94763014e7d99b16fc29a2753b377998f23baed3738e4fce3b77c01cf9d -size 715604 +oid sha256:902757f7ef81f5937f86d21944f8e1ddd3193bf83143f4b90321339a75af318f +size 703862 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ee77deb9ef..cc580defb3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:55fa7f1a4ad608e02600d58bebda2ec26d9584070f084b9872a5c11516d15233 -size 638098 +oid sha256:eabc0d04f93ae170fea5db9b01b371709dd96f72e3953776838f522b4f50ba1c +size 627542 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index d71e977e48..dc68dfd547 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa1445fa5949bf1a92c8d6fa617fb076c3086553d94a3513fc36ba6eff2a22e9 -size 699174 +oid sha256:06236e07da2f0d61753efdc125af7cd444eea05aff2b456d05f08f34a64c3602 +size 684966 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 0f9858f57a..61fb0b8d65 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:174ea68eaee90aaeb488f2b9e7356cb472b605f3cf062c734c33c364ac4a3d0a -size 662766 +oid sha256:7309b9114ebc9b82e4fc9603082a1f31a54db905a4b98cbf01fe1c68ea891454 +size 649396 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5ef91623e7..a9d629ec35 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:83f49d1609fb459357434061a781361f49776eee2f24f3b5f9a43d98dbd71a00 -size 613281 +oid sha256:c24ca97680e156393d44acf7856092ea42baa1ea0a4a05a4bd1a02ee5c09a3ee +size 601687 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7f502d69b6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bd3f2e25985d1a23f6189a7ad290bac381061bea7081c12496078a2011607d3 +size 730674 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1a77f5c195 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:297bd7c2974a32969298e31ee22d744c4726427f753ab6b0d27e30c5bbd00360 +size 559533 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e6bb1540ca --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:72f3b322835170d59c8e378d20e660ad15e28b49c5dbf1359aeccdbf4a581029 +size 609261 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..95974ab9eb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5996bf2a1c8cb560186bd020a08afd121550b7e8ed0ba2408e4c0fb9bb6b6a51 +size 672706 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..961da90974 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c095c70443067f9185e7ecb5cbe469f8c164d912fb6ce060b610c11b917989d +size 536691 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index eac21fab85..ccc69fa806 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a53c6f686be44bc6387f85bbd807985f6e74da4ee52b1377fe4e4de50706f508 -size 742070 +oid sha256:619f6d21b243132f4d853f14624c8d4c4c322066080cb4033fa40c2a0b8425f5 +size 725198 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 6db96f5273..7b31a4e02a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:041b07365621600daac7c5391c38bdc4380c770bdc276cbdf0dff083f992ad10 -size 722594 +oid sha256:9f0d362450fee8316c9623b47dc4d3eba813eed87bcd3658e6c4131e12c40bd4 +size 712974 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 6a0bbc8ebe..5c9a0f1dc6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0758416c9f1907705b6393f1759b3d9d5cb11ffbb4ce34e4b38dbcbaefd317b0 -size 653902 +oid sha256:26d1f2b74f8fb42b3648714b3a3c1042031c11443d08d9fb5242bd88466bf657 +size 646354 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 9da2e7cf38..f754f5f891 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf21727a4869b60b644122e4824d3ba63f3b3d814f04513e2fd41f97c061f469 -size 633294 +oid sha256:6b7a9e0f99d134bcfe9d81a04a27f216ca6d45e49ef4392e9fe8e90230aa6ec6 +size 626534 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 16367ca369..55955f0b1c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:96054372cc71cfc939843ee353176248d216c804faf2e4913b50ae54a6bac495 -size 666056 +oid sha256:f268741d0af08444b8d33474b7d98fdc9935da108dfe7145d64f6bfc593fa321 +size 653428 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a929922009..4be345b283 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6cecdec331c30c876100a6eb97b7668f125355beb11f6ecb3c01a60b1654842b -size 568121 +oid sha256:01ea44457aa12e97f8f4ed964f9dd48c778070abcc008cb5269dee7385167397 +size 561659 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 0704fe90ca..2551423668 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab11caece54abf6b435fd190e0a96cdb255fe9ad6d48ecec5cf55ccaecc3c450 -size 739564 +oid sha256:ef6eea425f45eb30a9ef278392680e5911268ec438a885c3ffed8c5b4fafa6ee +size 725356 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 0cc2843cef..51d5ce3f88 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8761ced7998e5be0627c29011db27229af015e9386d1924fe4c9a1f1e04810ff -size 645724 +oid sha256:df70b6359d081195c6f1f2d625112a5098c1efc9019e8fc292b3371a6b16acc0 +size 628408 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 9c3029a5da..3ff4a569cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be0398c61204a89ffc1f051c400329677f48f47b81df8c10837863e59d062717 -size 663738 +oid sha256:98ecfe1bf0e9f57bcfe44ca409732edac1ed0bc98468a4e0a92ce09c342ef955 +size 652588 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 145be76942..5a3fed05d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dfab658536df9e82fb54a361e626dd438129612c0bee9675a5c43efcadfc8cf4 -size 574387 +oid sha256:89727c3951dd6ad6502e7178cb70a21da9fcc0fbaca765a972facd482070e42f +size 558107 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 4bdff47fae..9a3afb000c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b7a0bf11fc0caea8e4683e416f36f11165045c729f40a4f8e5e00cc5f7f4c8e9 -size 621606 +oid sha256:a4834d05aaa34bb9103c1060f8ff98eec631c4c6c4f602901fa4f9e42cf21922 +size 620766 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0c38bd5a76..61c2c6830f 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c755871417ddb9164860905cf51f276918b47c9e96345134e9605e2cbdd6062b -size 535411 +oid sha256:f608782effbf533fb5d60a94b7ba51bd3b0fe7748768f1515fc5494f30762542 +size 529441 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index bcd06f6747..373555fa96 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:165d7e518b762583d22b109f35bb83f2534a153302fc948b3b042ccaf93b3266 -size 767796 +oid sha256:5338832626d511eeb6d851862003cf4a981e9d7d0d581000be6266d7c07f1256 +size 749494 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..eb4e1baf76 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88f8389f2965e5665ce6889a387fa7116b443976fec853a30abc2f96791889ed +size 738406 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 152aaf1e14..34294a19de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:feacdc584d94b86f1996bc7bbeff969bece77c1e7208ae54b2ab701c145dd5aa -size 678002 +oid sha256:ed053acded0e68f0c14af1b51dfbf890c9cb58122a36b10d9dc09f9153d94afc +size 669714 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..056daeeac3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5abb1ef03ba76f5de8e4c2648015648c8d98eaef04c142981a82c7b9e441abec +size 651966 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4a18977820 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e97cd7b7d62be96ff5387a8af5ac944fdb430e2d7cf294f16bf32795501e645 +size 589613 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d40f5971ef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:057daf0d1923769d2db0f66e6a4118a9604706de0030f79c48db8ddb6ec429c3 +size 495971 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f6495960fd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:005f8665511bfb241ebc6b1172d56ea8a7f0d7373688915bb62a9c4a0931ef98 +size 664058 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..375ca398dd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88e51834dd353440965352fa6031dac0dc3487e29bd10196fe6d0870d9a9eb66 +size 557489 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..35ea14dc2a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bd5ac1794100b701337751ee9205b91159c18d55e288a1110867fa60ed19bc4 +size 675996 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2363cd986c --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c2e346390179589adba9b17766839cb33428ee67c140e360450e97a56055818 +size 589309 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0611026eb9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92c8123f063cacb54ef350e6fc039df87d94c61732bfc28268014ea5a65f9083 +size 561393 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7584fe7840 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f501e78fe5a7e5cc88d3315c01b1bcd358dd67fd0f555adab390c1871d3c5584 +size 467305 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 7e3220d242..9edc3b5825 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6158a672c65185ef06cfae8dbdf77f67d8ff3126ec366fcbba48a2b1451d619 -size 773774 +oid sha256:e83f65f03411320394415f1672ccb1d5edf83f0e70e542f4bbb331959f145441 +size 760798 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 39b0233257..21405404f1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80cbe445d25c4393506b70b6fd1a7d2963c768138a05a156ef82c2746aae63ae -size 799500 +oid sha256:d36cfda1de4f1276c1596d9eea25d61c7422933c5b75d206a03943d44eaf28a9 +size 787610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 52b3daae5e..31bc7033e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f84258d0c7b05413bbfa3db9da3d2e78eb8e85a0cd09653102fea090b591ba6 -size 712940 +oid sha256:80b4f76971bd2ccc4d94f7548ed2aa286bd23c576499ef9ca7ed4a4c984c591e +size 702036 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index cdbf6021b3..f380498cab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac06cc60ac19d41a7011412a7f5ae1d1824f483e5d8bfd16e1be543ace0db887 -size 739258 +oid sha256:7397493a5a92baaa85839ac502394da03e46d9bcb7c798f5df5f483fd8f43341 +size 728848 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index df043bf74c..ed5a4a026c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:026158fed1edf0b71ecd20d23d24abc86517b4908e6d84e94a2eaf2c0fef94bd -size 705328 +oid sha256:52abe0358557016f9ced85af42d460e8b2c77b0b81c1778f7a2fac6bc32818a8 +size 698372 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index fd3150b8b1..c13508ee8b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d85dbd43f99953c7f9301f00bd1d6925524535a4423f694226b1efe8346aa560 -size 620516 +oid sha256:1189cc50d4149504b6789d8a2db1f0020e9fa6d9653935c0139154da2fd4de81 +size 612671 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 802d75d965..564c1fcd21 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a32618f635d9c2fdad6b1c176481806a806846409711285bf9d83e41f46275ff -size 733374 +oid sha256:409314f26cc1015058ae647a9c661b089851a8d225a6867ad65fd5d3fea152ef +size 724000 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 635a6b1df4..a4f1e4133d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b51b1ee375a06550f14def0e422861b6241ebda0419c98e91edbbc5119a45bf8 -size 647674 +oid sha256:9b8e3ccc2311aaef198f1cde5e78aaea1cb8fb95cc30246eec1aca5a9b0f7b5d +size 638794 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 0d76db5bb0..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:40da146e61c55d111eacf5b48a4e9c5ab01f685535624796a229fc1748a74af5 -size 567085 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 794fe708ce..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:47b56a9b4fb6455af490b2a860adc44016f36819ac513504b2937ce0be0fde3b -size 545525 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 010c558f5e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:23d49db26013112b0fe02b23ea0e4341ad57c74a6447f617b2014a22670245cf -size 558003 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index d90ebcbc29..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e180003fbd0e889cb7fd1c35066edd2d80b1d701164eb702350e3729f666196a -size 537577 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 8b2826be95..658d824152 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:299c1fb0d95f65c15b08481ec0e4b639b78c7f3875c6a0d7ba576f0c6c046302 -size 739848 +oid sha256:5b4802b1ef23303a8cad739b0af9bd33c670949307c9692d1f9e7e02de529f32 +size 724356 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 3ceb694dd0..89b2e6744a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7b09d7e555ce5d470cf1c3f93fb28f999f9ba725b78448178a0bb90d3bb2a98d -size 655726 +oid sha256:115fb3f3ef953dcea6ac16aff2ec08db2a60455a476f23b13de068cce909e449 +size 647684 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 3f150b6d4d..d0ee410619 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02342b6507ca0282cbf0560bf097d4539b9376dc24c0f1aaa781235f15914d11 -size 765870 +oid sha256:b6d3f0c02353e8679b97931fb9e6f98d847df3e0e3aa333331a2539450d13a4f +size 749442 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index eba5cac57d..36d9dd3233 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ae1e182ea9545ec933f983e5b36160a4ba3e40737dd55d86a7561d14ba0d7d1 -size 679528 +oid sha256:cdf047230c0d4830b23bebba521114acabdeb104f8e098c4728c6b3fb2d7bf97 +size 671044 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index af460bd848..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:43fbc888f7f5e2696dbf046fe6f3629eeb1a2c34abdda9cd18ae0afb3515efa5 -size 589167 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 851201fa9f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f4e2aa86f6a29407037af6e8f4f4942849548eb7d21c65b2aee28b4367807b5f -size 484573 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index ababdf803f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9a61fc8bce2ec7e32dda70acccaf4b298f027a0bfc7f5f6376d72b9f7b104b30 -size 562871 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 20707baea6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dcd88f1b30e8cb8a717591e0bbd759c1bfa05e339cc6b9923c822266292980ee -size 462419 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 6bd69466e8..571960873f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77a34d52edb6ec8975f9b297b63eed64f33d734ecd951676d4bf40dd11ffb48a -size 829010 +oid sha256:dfc06965833818765f63dd25b80e0ef6ce55de46a741fde72a935f48fe83adf0 +size 814802 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 83074d4b45..fc0b810266 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1adbb9dd5579e2afca1cca1b9cd8d2ca44b0c512dad4e6e52128af6725df65b4 -size 675186 +oid sha256:affaa4c3df372efb76155363e1a9854def5c7a8ba2d7440f74216aeef95c07bd +size 666010 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 222cd567c9..efbe19c2bd 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c090c0e3b0f0b0e3944c32f5649485edcf9bbb64697d4324b91060ae57bddede -size 744696 +oid sha256:fa876af292a5aa4f346a76eafc5096b354ebe4dbc556b084602b874141e49bb7 +size 734140 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index ee6ec00071..5c0392e296 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:581ec682bf51230991c08697eb67e8ddfbdbb607ead3b1efa419e71dc24cd870 -size 737592 +oid sha256:bfe904fa38e003362de55a879ba180bdbad335e1656df37331e135863170684e +size 743512 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a6427e2691..19f91bd32a 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98d8f41a2f3f28aacb06712d3ec0b1951fae1574d14e9881fe320fee1018bb3c -size 647262 +oid sha256:da9611784691f6801b670746f6e33e12fe649f5fe8b1dadad992d3ea83bb4492 +size 638726 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..714ce496e4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9088c8d5dfd3857860070254c2cd8bcfecffb6bb20b90e881216977922a9eb2 +size 841268 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 05d5283813..15362423b9 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:03623d382435420df6776b111db1c45c12486f5a5f5da7beacec820d387824ab -size 593365 +oid sha256:a6efdf747dde03b0949a292d3dbbd1cb07e865bc23912ca0b51c1d06de9cd220 +size 596867 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4edb700371 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fde4bc341ed6aa6e1e48a761361096aa93a9eb53845f5c615f8c419c73706a9 +size 658782 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b8ab3247f6 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc006e3cee5566aea48665bcee839d54f40d8526e7e1b567c1865e9330a156e7 +size 766822 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 73b7f8f26a..4a72c20db6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4090e78c64b70abeb6c1a64a8ebe47b5a3a4760dc34da6353601c9a88ed489e0 -size 571015 +oid sha256:6956a4befe7e6f1d39f4a9233545a2eb4f4039782fdb56abe2c9fb785e4ad822 +size 571163 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index db036fbbbc..a514ddb0a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b4613ec04b2614aa0db8011beedea1ded19744b1d4c912e77d78e3f3d2b4da25 -size 767436 +oid sha256:6e083d3ff5fb5899ac6cd1229bfcb339b6d93222f4ced2447fbf90ec111d5b41 +size 755496 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ee4345653b..fe06115dd1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ee874b8c0c1e9991aa3d7f76365cd5e6fecb3a807960cca897ac435322b40c7 -size 666990 +oid sha256:69c2bf7975c48d4ea221b380329eb1ddaf6be63d3b9e61cfe7e595fdc92e9595 +size 657716 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 3fa4f0bc3d..700eef652b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc1151e560e871e9cdd969ece72258a58bce0f5168ac46eee0cfaa0d2e5cf551 -size 732802 +oid sha256:dbc533d8113180c1026177007c846419f0bbfbb5246abd41ae48c26ed824fc13 +size 720172 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 1a53f3e219..8d8df91e5d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f45e3d0a485eb67cc1adceea79840f1dc03cfde732823875eaed6b411f0b982 -size 701622 +oid sha256:d7af654d0e03fba68f6b33b970b98373cf463c262e09115d5217e0942d27db55 +size 690474 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 77e5530798..10df791755 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69304987029071b6d1cb718afcfbfb05ecad0ce63f888a0fc37d2be5727c82aa -size 640546 +oid sha256:8a0dc7bc031510813fea24efe0be7f68b92d4a0b4e1083a1e1b66f160f11f3a2 +size 630482 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b280f56e53 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2e18c3006b63b8926f8cd8807bf0f0dae2d16fea2697a5f925cbae27fdbdae0 +size 781964 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index a60bb4fcf1..cd897f7d57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fdb96bd4367d447f6676a87c4b7816d5a255e02d39cafd2c739f71b60691f0f0 -size 583443 +oid sha256:56528ee887c511b7bfc77494c7ff55809a1ad5c3100e934ccb6cdde9982da94c +size 587389 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e10207608f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61c485c05648850ab45fb78f94cd8d39a4649abc60658adcac19c0a73a818769 +size 642694 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ee71dfa1f8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d55c2c66b9eecbe8ff3599c7fc46d0b58e8d66b765b93e88886741289ca70f3f +size 713782 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8460091a27..b2e9f871e2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c8cbbe83b4f6a3f656c508fe64f20fa7d7a2fcf5501ec60327a8db6e734a9003 -size 562277 +oid sha256:d8b5d915c80c34d56f6fb7a00b2a5355ef690e171eb432ec96687149a63cafb5 +size 563263 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 9a0e3df285..37aa0c83f2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:60fbec101d66f64fe486d56b7de46db00dbb3b3340547dad6e597dfd739c2aba -size 813240 +oid sha256:1eda55f2521693d94d69a9f410eb067185d3eecfb3f535cfc48deac4fc0be51f +size 801104 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 6269ec469f..fcbf42efed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:13e54dc063c9eb48e169c41be659e369c27851bd3d85e353a6d7b7130f03bdcb -size 772404 +oid sha256:c698242376eeb7f4fa67b56eb8d4f0e445d1acfc4afaf7053ae5e84a8bd491fc +size 759774 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index c71943e631..c10fb28305 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4438f3cea0336f775cc0658751fa4e8aa6c277baef25385c953b0c639b8555eb -size 724136 +oid sha256:f8b87af2e21ccf6c748a15132552b970f97bfa23278bfbd4b605124333667bef +size 718364 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 312bdd6366..fcae6db67e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94d5c6be621cb4b619a63c3aa282587d885ef9fae6b76b82711844e5556d9618 -size 682362 +oid sha256:685022a763e90cf484af37f3e7eac17f98116e6231c680e603332074777717a8 +size 675604 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f126dd1f0d..9a8b7416dd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:189825430fed35ed941a166089ea6de49d1fdd07d0c76b737a5faca72cbb940b -size 693764 +oid sha256:d528803750d5680d13e9297a55cf761ddedffe6d97bd5d28b49684526d073e41 +size 681728 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3cb69a602f..945ef8592f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ce0638f8d565b9090932976de3afee7629976e4004423e1889cd7eccea02b31 -size 596421 +oid sha256:61b491d8ad1eb5075a09756701e11eeabd47d659d66f66ceaf3c79e2622fc1b9 +size 587491 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index e4c407021e..42dfd87b00 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c83eb2104540eae0cb5e045b5f0548be5ba2fa6024d9bdf03a59db7297dd249 -size 771958 +oid sha256:40454cee6475b85204265c2d3330086c8bc952423ce57f73eacebb0216b5bcc3 +size 758292 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 663ed19769..228d89e8f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:afc077b8c5bd14737a47d18b057b5ecc589534363db4d7c32d18711dc7240963 -size 677230 +oid sha256:aecb3ddc1bc45edf81ccee1a5b7e40276f988bd510bc0f2c207f9ec01dda9977 +size 660012 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 32538216ca..a7bb891d40 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e41fc71821d8d4cd0d1d275ac2ca8e8b46969448ab0a066dcb395eb2d0b1b4ae -size 700474 +oid sha256:2358b96571b8a677374d57437be887030796b8d66566c800b78ffd7f1bebd072 +size 688930 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 152d6cbd81..7ae5ea285c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3c0680c846f5bf7c02cb5bf461ea7e5acb3998a4d8bc837e4882a9ae1db04d3a -size 612997 +oid sha256:72602f3cb5c2ea15aceeb8f1df99e5ed3fe424b1b9d1a8189114ff9e1768bf3b +size 594053 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index d873a90414..9c34d50527 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:19484155d30953010696c757e7b89988a2184806e470b23ba8f68e31495513c0 -size 656566 +oid sha256:fd7934ec060ce480a813b68d40b3f6c4a4e9fafc094e3254f3d305c3390ed873 +size 646798 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ee9c5b0b0d..a92defa6d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09e0def17ed2a2a261e9cdf89d45d99c432e7c7303ffe3f7b8ca3312254ff094 -size 558975 +oid sha256:8e195d8f0f1865ea8bd30dd36f69806ae49546d702e65122a47c3ec8308db93b +size 553105 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 4c6b03b45c..ec9368a47a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98bc67aa51090e3a71f139610fd7b5e8f0a2c126f1a6bc15591b07f80c06e704 -size 840002 +oid sha256:1b0dcd175b7e8abece4cfee01a267e13330e3b92230b6a368fa15aa94d4a7199 +size 825350 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..515c6aca32 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b40539123ec91260c9fb539c3b1de309e9292a2fa448acef6bc9aaf3d0868f53 +size 785254 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index de8fa1866c..5803fb55d3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d420dff09bd1072e0eaab60f7c4e36b5fa73db6ffec2c9a2bfb656c751406b7 -size 750208 +oid sha256:47ca4362365e9c2973b170facb57054ed66b245cc1746cdbcc8aab3b4da5469b +size 742118 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e25c649c6d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c9b44438aeda0d2057999d40abc24d1750925e3e628c17baadb06ad63cf71bb +size 702614 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 85a17e0bfb..51574c849d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:176f2c101ab72525827585582e7b1d241c942d9c8bf2ad75c02728969bdf1e1c -size 613079 +oid sha256:16bbcbc795a4ab16ad512e7d8a6ad2c7f3e9ec276412def145f9f63fb339bb54 +size 627928 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index b21cf75d6e..f69f5755f8 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9cd9a9bdc7b6851a4929be8dd3878c07304c4b8ac29aca872aa4e9ef2baaac77 -size 508287 +oid sha256:0db43939ab72c3fd3d5ac972c34a86934f735363a8d78735e4974767ee881254 +size 525257 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9be58e9b9b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da01d807fd131edc45a4b358680ab1c6550a99e6a421b3e4df08b5b14e083262 +size 698328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2b5a8ba109 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:090c132bfa380c1973e351b094f546771167cea60b7c701c16e4c3534b4add0e +size 591659 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b47c30462a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a63fbcb8be52c11d381ae784caea69559000c74a253558fff435f3f71df7a9a +size 712288 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3a2a82a48f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a84cbf2825efd21f7d86c3ba040703f41308fde46531b30a22e7618781789d92 +size 626884 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 2b530e8ed8..d965387090 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7b2e2488f7e7a38a84ca0f24bd02260fac5da92a2ddd6433d19d3aa0e049a442 -size 585647 +oid sha256:ed635d3742c0cbd29ae93147bcaf122aa09998bfc4dc414f0a0cb0f727fa077f +size 595613 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1e95fd0e20..ff8b31f7ea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02176e579f6eaf41316e532d0a54a8df61ec93a801db1214002e13009b8224f0 -size 485345 +oid sha256:42cb3725a2b641408c6c693b2a7f145017de8f46fbb64030829c7ab383da7afd +size 494471 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 80d426379f..eb11d4aec8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cdb7bcc9eba8b69b65597a6436e17dbed2767de0d6fd17d1e829bc96c762f1e7 -size 755686 +oid sha256:560bcaa288ba1642d4c7ca5da3763269abd1ef80c083159d59c445937aee315e +size 731710 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index be4820f282..351d103e48 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e0f360581a8edee7a47c59e4bb49f35ed354f001492b5917125b84c9660f506e -size 667864 +oid sha256:3d0f3ffe7a8163f35335ea8141716affdb8cc1fcdcaf832c6e16f071940e47ff +size 642458 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 45213f0bc7..ab355e4eaf 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:539a77ed2ea3bbde848161611b66808b9757f6ac66f487425a7f15e37588caa6 -size 783188 +oid sha256:9641eea9524afe621b89999c4626bc544ad5f75bfa88847e86892a2c60fc150c +size 755710 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 2f3c3f94bf..f44030da9b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bde60820accfb98587d7994dea8372de2b8dca92e42a5bee1072092923cc56b6 -size 694034 +oid sha256:b85060f94e2f9fff77d61c9b31629413a50267da31a9deb04921da7bdba1b2b3 +size 669812 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 903ec0b4d9..30c2d41e50 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa319b70a5afe8c07bf61d84c935f245fda889e1c80f5c1ec7e06d6670a41f42 -size 759630 +oid sha256:fcc77e7d353868a96b52fa564eb8bffd93f9f30bdb1346f58521495f1b951296 +size 728352 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 9ef99d950c..d7114ff46a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c060dd3a835c2560f9e929a416cf7377e16f13b182d9ac24b72ac72757c2cad -size 668208 +oid sha256:fc8bccb3fea8ae6b7690a2877947eb356eafcb03afd119672c9f242375d41d20 +size 643048 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 680a1970a0..2c413dc9cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:89acb63f634e082c274fe2453b457aa25506a29bbbfa664973be7297427a72f2 -size 779684 +oid sha256:b9b39a82c38427594de5ef47b7eb65962a8b3daa9386fe35cf1ea1c65b7e74fa +size 751218 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 9f3c81e9f7..691cc65164 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e7153aab12ede610f19bbf855361bac65bc43411e5e9318cb03a6aed5bf6da7e -size 694920 +oid sha256:08cc07c44056afa1f1bd0426db22b4a42b99c6698606ea5de4a3cd7d6f860091 +size 671092 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 32980d7ea7..fcb11b5072 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a21d223ac7604ddd339a2ab2fc5b07ef19742375f59fd26220b04cdcf12bd2d4 -size 823650 +oid 
sha256:35d1a48401ec429334c461c4d0a1499b1763d828fd57728ce7d908c4810eac30 +size 796516 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 44bff233c7..28694c52cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26fbb8893acfb3e8117cbade2c7e06ec3d07ec71f17fc5513dd15e703eb3537f -size 737308 +oid sha256:49ec6ff506446839c262ed1c1930747f883a4e7aba3341d5660e60adfc9dfb8c +size 712938 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 614e9d4093..bb016a935a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c02c0a35a8fc88af360c17cfbbbaca80da6a0506799141639fb102bee51ba529 -size 851350 +oid sha256:698d36416d291082147e7f6a620dc9e40bbc2b5c0fee25283d6ec0ea74d2a5aa +size 
823426 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 7d07f2d974..431e60f996 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:da2d83def30545955606a97da7a2cbf3e6bff4db71513ac4235d4a34478a0041 -size 766390 +oid sha256:8cab8370b5ff8c31daf677e3906bfd556c14a3845afa3cecdd3183a441d3077a +size 741526 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index e728411666..bf3abf0a5a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23f2b3e91e9ca8d85a2207a305d284248f3dd35857bb615639aff8a005969f15 -size 882802 +oid sha256:326bc3adb86c0c14dde9417b2f41b5a34fd2c2e0c0110dbba4a93962f0608bc2 +size 872738 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 84cdb0f971..bf11b32788 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6beb8c72aa9d48e56bac31e154edc07a54ee0a82b88dccc88823d848a299a8fd -size 702042 +oid sha256:a6e0c8ac2a815f27910c444043a85da87b125ba6c7442798b63fd47f8af5ccfb +size 693014 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index d58e9c4fb4..46a5cf51fc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22b55764bab58d7583fa645ced060ab74043894572f117e39cbda2cb9b9a25a0 -size 774118 +oid sha256:9f3b9d6326a0cd9441faa96645f0e16dbb3995d994edee45aa2cf651f3d9734f +size 762772 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index e651acc3f0..66b21c0e9a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:861efee648c20a4cd37108eab1814d3140ba107ce5521b36434dc42dc475b637 -size 775550 +oid sha256:d4c87450ab671c8529d4250b2c055416bab669e879b3439a43bf1a9e9e3410bb +size 764598 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 724c816053..0503964cb5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6aa5541c363d8ac567b5aa20503a31dc6a1a34cc7a118c91973791ce3f7eb35 -size 656408 +oid sha256:66fb4aa4369a9df3e11cc21fc00c7e5111baaca984db0b371c0550ed7bec6ca1 +size 646344 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..579982b349 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6be2de7b17c829ed695eed23fde6f9e715577027042b81d8d2636980cd3fb018 +size 928608 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..41174c4ad7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4e06230354e35c0cbe4846adb465775ad378cdc5e8c9d32d5d5a7a8ab39471d +size 596739 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..667a39fc7a --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8231d568198cb9edae90943eee26b918e3b5dccc404f51774dc340ab22b8aeeb +size 656236 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..79c267b187 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:deae9efb18676ea43d1bef08deffd54ad84a33103a935ff61ac4241cf775fd97 +size 789534 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6cd9beed4b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77459e33c9c879155bf74290db13d0fafed82a0393978c2b620e3fd8ab64dd1a +size 554903 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index e563064f52..82b12db62c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:522fa963b452bdcae0744c7c6326791b49617e231f83d5fb45a0be5cf6a50bbb -size 845500 +oid sha256:58e6cfe9d98894e0dec7dcd00b5c931ee1653b657647eb12ab22bf41f3e30259 +size 820242 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c2944d2859..af6228736b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:294652812aef385685b8508e99dc5d83b327542c542ce6282be9ac83fcdbaaa9 -size 690986 +oid sha256:e728a7c36176a34753072b1dcd13c018e678fbb9d837fd22743e074dabc1fbc4 +size 679244 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index ddf797492f..35f0089e10 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5fc535efedc07cbf038a15ac3558035253c0af747310cbab4385e9913df9d656 -size 751568 +oid sha256:8382fa341bfe0561d64183bf2113728b8dff6593211fe51e4e9c66a1993fa699 +size 737508 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 2ec6a11787..8764a54b12 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:954e18932d033059cbbe832563f9e7d4976f6a73a990fa67c9e8710bf49b1e8a -size 717380 +oid sha256:7b92ec54197158587e27fa26af8b0e02434228294d9a5034ab050ec8241fd04c +size 704404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4053a28dc6..5d053c7b50 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef58d686bf8b3911ceaf8c34d8d824d7f7a31d41592b11e24c57e89b35cbd868 -size 649692 +oid sha256:17435d9e44ed39c46c0ca63d14e7b0e6717955ef1c990a25985449a1bb428c97 +size 638888 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6f457e4290 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c20d7788881394af31b2315199eeb7ba4d0eb7efec2afa786c12453123e7f7a +size 875322 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3ec43b6129 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcbaea916245ce4804a68cffdad814b9d5e3d3cde47025cb393582be69be55c2 +size 579811 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..60ec2b01e1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0caba5fbf7880ff41b63f158665c4fad53b1ac2f38fe5489853ef08d0e49b2be +size 631712 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6a126efee4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dcc3afa9002221bfa0fccd39f1abdcfc8e64b693ffd00ee2c907860ed6b6aa0 +size 727764 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..31b14764b6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b05d24fe7bf7da165572040d3575f582c592d61ae41920d336b83aaf64e60b18 +size 547003 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 10a0ecd59f..fdd8e61a73 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b409b21cf254491250bb590bc23d56b3cd47a22c26fa500e6bc3d5838df7cc63 -size 855094 +oid sha256:f33ba8fc1aa79b4baf5876a133d2298102f773a56ecb0d0411f49dc6c1f69a2f +size 826382 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 1e93434ecc..5a094a5882 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7cbb89582b132bf311be5f17c6b1e9a302421e0e54d88d23246ce4e2d48ac6e5 -size 836310 +oid sha256:7c83ad64422cd1e0e478ec6493d974acbe39f814f86e2342e2f8a8680267fca0 +size 812728 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 346805679c..9d60f0c24c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23fff1176c54668fcf61b7897b45a496f1ed876b2a11dbd49794844a65439616 -size 761254 +oid sha256:4b35a9e35d75cdb3a2a95c2ce297f303fbc57f455b75477af68bb8e9ce3d7500 +size 731556 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index e95ed83fcc..6c249678b8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:94186d9e153acfceca0287a12099defcd473e088b2a1d2eb1ddff31d49717939 -size 740102 +oid sha256:7bf3c671f0bfa117ff3c54be023f6c264c973e59e98a1528e999463516364cb6 +size 717360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 6959621e30..35ed272562 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0393d08c9de36ece02bf564c65c858d06a011235cc7a4b73446bbceeeb6787a5 -size 717464 +oid sha256:2f5167b67bd10e7ac3f2ce0fe2ba2fbf856c92b3a034de962fdf14fcf888ce6e +size 705130 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 32217a3719..445d06bfe6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6d0c4f82bfa6ee842de727ac77e3636f325cb5803ab5fdda129599bd6df86503 -size 609021 +oid sha256:599a59c513fd08b404d33d62be7a79b7845474b2bbc6a0d217a98832e39ae03e +size 600881 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 1c0623aa2d..1d6c6b9095 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba3c9d8be3229b7ed5e4fc24ed90f37976e840d756db7259ef833e573e081e35 -size 805622 +oid sha256:e3847960f8c909f870c1933049a783336a22831610ffa95b9001527700ac6cce +size 791464 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 88c8d0ac5d..afbd38ef7d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7086243ea1f50fa3044dd0d19a14714e197d5b14847e2b054f5c11398c9bbb79 -size 691210 +oid sha256:f5fc044642488ef291dbd15c0e71a199dbd2110ebbb47447d75d6e24c81c35df +size 680604 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 02d198f0c1..df31c47175 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf5ff5e496f612e285423d233a538a2bf3a192e8c7b71cfc8d6104ccd47d2969 -size 724174 +oid sha256:d827e596699523ae72566e78dd802937b6729ba87b059e03128c9ea30fbeb38c +size 713912 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index f5f2116a33..2806bdf3ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b28371e3f5eefa524d519be0cb3259fbd76e19bef98639aaf9ef15f13e407b96 -size 623278 +oid sha256:27110e3e71fbb5110d2128d28cd422b0e4fa9b6093f77ed9ce8c82227ac13280 +size 613905 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 066d276701..669d9d88cf 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea0f9a60f8ddba231d45a09f246905c5aff3565e987fab42798c61027a7b977e -size 655844 +oid sha256:a12d8e54ceee35d0da6b3db5461fe999dfb7696fc6da2bcb149d8a726b032160 +size 656486 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2fb2b51b6c..473728c9b7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:78bd35d7106d5c2ecc639dbf08c6e3f0a6d597c03a3303ae1c8c4cf46af7c88e -size 563187 +oid sha256:2f040bf2fcb920465e306c6baf59282ef0ac4c8890b486d2a5738dbb7e8bf613 +size 557909 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 081063d2ec..79ab8db947 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f925ee9be272ec5119aad35cb1ca23345b1ff754e110e7b65572c17912883e0e -size 880130 +oid sha256:9770a717be44e732f12e96a149005d8c047ea1e5db300e87d09e37667fa143fa +size 849544 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6aa78f96ba --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22f89b2806c699829574b453e0037c4a48752ac94642dddbda60fb0e870afcdf +size 860852 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 7dea3e71ba..97c6d0cd7b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b70422b9c4b2f1ae39513db3a58840111b2672b2613328541fe014f3a1f28b6a -size 784860 +oid 
sha256:94101ae75e1eb8090f10ab4b0ce05352d0aad11d6ff17e3f923f2f0ab13d5442 +size 758910 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..52fcd67fdd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a428aa3915891ae53155c6e2a0c8da059b444a850c0b4afc9dfb9683d600400 +size 773278 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b5c54788c8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f46ccae42e442cffe2471c84a56d237f9af355e2f1f95490922ff3065ca6379 +size 618970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e814d89df2 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cea4385ac6d33d23ebfdbe8c80348e9e84e0bf508d0a9e7ba2072b8eedf6393 +size 515015 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..42de86796d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51249750ef153d4d472f7c269237495eb86cab595cb80feab1e947ee6b0855a7 +size 693068 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a307c03662 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff4c51a9b71f795174b31cfee5f4d271cbf8efb7bc0053942e4b907fabd7997b +size 581813 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..74b442bb15 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69b6cfe79eb2213dfffb4745c077e2d5e81d919665707fd0b18391421c602caa +size 738060 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ae8a6d2677 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb7906efdcaa72204c1bb9abecde2e57c2e38b8d28b3d6d1a7df0cca7cbb59ef +size 637264 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..71c374c844 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9d69856830876b916bd1a76d94ebaad082a1a8620a550273e67044a1f095f63 +size 569239 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..dff2b4f0ca --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a518ef1bce653a70fdfbf8b497e0c94ec505ec23d8bb9cf70de6ba65388d3c0 +size 477717 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index e46aaf610f..8f5dd3ba17 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e58fdb46a0e61b510de4e4988944e45bf28d4b2c67a0240a5725220f68d10fef -size 878214 +oid sha256:9dbf937c0c6ccc13e44021074c00c0c7187a35293a8a499e31793555ec2ca3a7 +size 869778 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 88dd3178fb..6f82eaaf8d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7edd4156bfa9ee8b3e86c9998f0a56a40e2c7e5107eb125382756ce4aee73390 -size 921504 +oid sha256:92ac60bc456a185d517902950a77dc225b5abbbff6c7f1264f705fb77ff9f90f +size 917212 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 02c3991d27..7c02405a94 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9180c3ad989ebebdb134be67b0b336f9e727960d4216665093118fc7e0d30aa1 -size 844514 +oid sha256:e879e6f401a0fb870cb75523bb9573cfefb58d234a6183321a777eaca389e302 +size 820044 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 5e6002b2f2..af40c5ab10 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:66d44deaa4a88b7fbca63024f16ebc1f938ab70abe09d9a61cb4c4ec3b866799 -size 888542 +oid sha256:c25be79600f8ca1baaf291ded571abb085bc18599441be2740bbc4ff342e6f57 +size 868218 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index a5142b0584..418b6393c0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf05c36b2ffd950edbc5878e88a2772b4663b68154f647840613ef01146f0af3 -size 821412 +oid sha256:b90af4d266a89134ed9b689271c8771bf599ed789361e8e1e639ff0b2bb43086 +size 795462 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index a1fcf6d4bd..2a9d1a767f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7147e648a3a51f7996c3238cba796526097d398736e3e5e2422f34e36534d977 -size 733886 +oid sha256:c24c9231e30673c85c9b66d526701dee5e48adabe970445a5f81ffedbce8b38b +size 709318 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 4406a5cfd8..26cc00ed04 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:618f56614a56c44a4602ef1fc18a093682499a9d9303c2a8ae709a2d64ca9e75 -size 867118 +oid sha256:30c88185c3f6ec4d8a3968a9ac7462e06d5e8af81882b2b4838e99c72dbff689 +size 842994 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 67e62bda67..b6a7b75a77 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:38f54f3daa99af6d8af5c7f6d63eab3ffcd167183592f4525c541a4be6bf6c42 -size 776484 +oid sha256:f779d31ec22ae55ff78f15de7526bf2019b2671f0556185fd8aa0a75f32eb9d6 +size 755764 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index c329682d0b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:80b301143f8fd3b31c01c0956eb2ae26434bca8ad8706f3a8f2f6633a4cf402b -size 595405 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index c3a73836b8..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ccd4eb10f3e94bcce6b8ea66de8712b8a6fd5a7eb009935f8fb898034b033a7f -size 554801 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 19c08bb0a2..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:30e0cb5f67565b98f2ceb44702ba7c5da5776cf31d25487e5e56e7e7c1e4a798 -size 582129 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 6280980f80..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ca4e523b237d811135bfbe803b6f83175cbcc02ee3dd5b6b3f30ed6d804ec449 -size 546903 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 1ff3ecd942..efd6d1e469 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5bb73e1895659aea2857adc994cbf08b8916be6311755fa5641626b718514280 -size 853266 +oid sha256:6fb12486c0fb483432698f8800acab7671fbec790d9adad7ae57bcc70c305312 +size 825146 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 38062312fa..46593f8b60 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:49012586c8690eb7b3c39f5e0916c6c4a08bead11daf067e449bf3e9d8593e2c -size 763670 +oid sha256:caffa49e8b8cfc4951e4b466f5fd9958b04fc392be0295a82d6753a71930da34 +size 732194 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 09f795b2f8..f7d496e126 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01762ae4b0fdd03faede44d845352a705823cae793561fad159df553006829cc -size 877514 +oid sha256:a5e282bda7b31b2cfcd0dc2accbfa2dd4dab827c13e2d5c989e1e138a5b39c70 +size 848506 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 1256bc6487..3828246c86 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:af9ac553d578a806a26a9f77a99b2b0a8e9de1d4d3da9ae84d36ec70e9ccba2e -size 786238 +oid sha256:332f956a08893131c45c8d5bdd881c4a5e412ab9ef5b5f6bbc58a27ea74ba372 +size 760240 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index 9ef7118f49..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:82ba478e3044f17cdbf9257289184f39f4a44dc919ccf9e3aa6ac81a1a3a3c2a -size 602291 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index b5971d2200..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:905c34fd9041b329b75b2daf7f6a73233e6512bcad07a4eaf1260f49c95a1302 -size 501841 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index bda8669ab4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:37d131324fc34daf2e5dc8414cfec961b2cdb3a777bf5e0dc63e7578a8089275 -size 569137 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 701546d954..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e5b9710048bf50ab5d71497f4b713be20d1ecf6aadb766c4a16ddda7850a954b -size 471697 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 527dd48f32..7f776da6f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f4f053c2cff3371aeea0bd1b729dd97e4457ce3743fe07409b81be74a6657c3 -size 932956 +oid sha256:e94fbdaee89ad0499812db0d7b374fc2c3df6d9d19823621f85462146dcbd664 +size 923780 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 436f54b4e8..2d9ba8cc84 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:75380d72a706c4e6ad8aedb14c307ea758262d87167eb4d989fe81392fe97707 -size 731724 +oid sha256:a9f0467b6505c6f1533982ecd307b0683e6a8ba86b6426ff48daac9bac25e701 +size 722450 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 0102a0f01b..b049a2a243 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c01556e1879565a0b0ee09ffb8f4b696687e69517cd694954e6ce7a6eca20e05 -size 808782 +oid sha256:0216f8009c5669ee7e6aecafc8ad09764773de370f6d26864c100fdd0b0f1b74 +size 797930 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 
c867fb674f..128a1dc76b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba034c18ebcdabc99ce92c91b6f629c283ea6c6375b827090bd128dd6d4865e5 -size 800544 +oid sha256:502394f8c505a0e10adbc41e4ed3ecc25a84cda1a60c8520eb5aa31df9dc4561 +size 805674 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 61eaa0ab32..f7a19186fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c50c89d045ee950ef5487b217e733d1c8d563b0e63eec816033ddebe837320d6 -size 682882 +oid sha256:02fc10939e39892570b1e4fa35bfd46f205ea74303381bde38df194f54ee9d0e +size 675186 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9a41b880c7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55b5ce639554d9386135eb5e80c7e19f49ab47d7ff3cc5381a3182b2e075045a +size 971954 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index da3a23ce95..e1cf488a63 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aec93bcd7a94311feabbe4974c7136cee1e180ad42fd1ad970b81fffef8a8dd2 -size 620106 +oid sha256:618a0f9d54f6813cb446dbc4c4778b3a00765e2ee5187d2a00324ccbb551ed8a +size 625386 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7d45db4206 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21525e5d28c3db7b6548fe692826e3960b1dd7cc30ddef2a963c8ad422f7f0e4 +size 688828 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c5af377fdc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20bc6761cf905db1b0e08ea4fe57af3e5155ebaf2a462c88cda5296069cda664 +size 828984 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1fd769fe93..d986c3b8d6 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf22145b450edd6790b587b2182006ef3345d3d0ad488d09e6b172b0e17481c2 -size 580341 +oid sha256:e729bc92649629bab90d0e38b99bfd5cfef731f408c275e83efc7538fa320506 +size 581377 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 8aa8116cb8..1e44aa1d17 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:736aceb137612956a6b1c0985d8cb995d80ae6d1cf7a23d94835a4261b501ee7 -size 900736 +oid sha256:a1cd7bd26f59ba558e70db5081b6687141e3054518c3c7e237e7dd5baaa34249 +size 875082 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 
7280b9952c..feb36b178f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:660863e56b65e2d7237efb7176ddcc4096f376d32d003d9fdfc39d251fd97ba9 -size 720668 +oid sha256:e32c1e4297ed05f9b4265ae83a77462117e9be4e619c47b18c267e6102ec6785 +size 708680 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 23c72ebe46..e98a168b1d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:161b2a26e796326edb0d9e51d41bfb432c5b91aa1ca1a219ff0e63f1f267c9a7 -size 786232 +oid sha256:69c18b96fdd98008fffd56853684e689f39f454a3b49f4272a0c88af09137207 +size 772664 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 
5c75dd413a..da75c2a826 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b623129ae2318df6f4af5217936903aea0a5fca6daa82b8c4d6daf203d6200e0 -size 756236 +oid sha256:d331aaf6121a3aaadc2ddd018f32ae8100ae556676654d8f2f2529fb5ef54261 +size 745384 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e7dda39a60..9420344378 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:988478f439a4b10937f77cc104fc19cf2f3ae7379dac0e1709aad0b10feb3889 -size 676956 +oid sha256:1344d944aa37211c72c6a56baaec508f0ed3b1529374b938353bfd0fa240e8c4 +size 667682 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..62a5de9186 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd296bc87fcb9689ce548c3fcc6807d4e2dbfe3e84b389f5fd60dd17d602fe2e +size 922466 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0fb0d870ff..c17a2978ae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a82412d160d6089eeb8567b8d3e0c687c7fb539c44f1bb408909d79cb70f56d -size 606829 +oid sha256:4568330d9d458bf5f7520f2fe912d91879522787b7d97fee5ddd577051e1f3c8 +size 608457 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2f09963a90 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e468e429c37ff0f3e791f1e98834da35509953a0e039c4aee02aac3c90f184a4 +size 663564 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c2391352da --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3317caaa079d84c69a145955f91f8f88cd738e8287ed4f8e9ddcc50bc2437b95 +size 768792 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8bb5b53301..c73bdb9711 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:abf2b1d2ccffc5be31a448b061073d28b7ef887da6b3d30072c2b5bb5d32f0f7 -size 571553 +oid sha256:1033372ea03791f73ef5c85a99fb82d99c579813555d0769ff719cb16dd50349 +size 573675 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 27a81faaf1..482d198534 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f4efd19df81681b7a5546848383f8ff992137042edb8a377483f013d26a6b39 -size 927104 +oid sha256:c58fcf1c1baab633cb6237124481639ad3cb085dffaf394d73a661eab672ac9d +size 895826 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index c176cea497..c1ad5a1b8a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70acf74351454f7575faf4dedbd855346668df4fc5e3d4e7b010e3578a9158b0 -size 885428 +oid sha256:aacd4431e9f2effa1ea3bbfadd71f1b3dfb7c5d17766e3487949dfa1af819b9d +size 858444 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 7d1f46e138..009000d7c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:35177e98852aaa173b90adc898faea39ba6922508835906de2acbec03e18c4ce -size 831142 +oid sha256:f5b9d5e6c916779e25e29f2ca20c2468d2eb4ae8b3bfd2c9111d116968568fdf +size 803022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 7b9d75402d..3580310c66 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:68d34e5e3b745ec61b6aca6a921cacecf05cde25688842e09b5c41be08d6a140 -size 791638 +oid sha256:703986d0f61fc962c0f24b246f099b96024b250b9fe4b9176a7e9a156686d6c2 +size 769092 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 5443f2dc95..1629ebcde0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:75d7265527d71e9660441bed1b13f2f468de7c23d0307a7205cb160f47c9110b -size 746800 +oid sha256:f544b1ed7de3cd797a23d04f87889f163c754790b30cf928dd4331df0083cc47 +size 732394 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a20a6141e3..16a8d44826 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:19e481510ce3a2995ae3645692709813753b45094fc996cd9b1364137f9762b5 -size 636434 +oid sha256:da0a25f4ccd8ff17328ef9f3159e591e1f91b1814b146b32445d9bdd1a7dab82 +size 627504 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index e664f04ba9..b8a12295ca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:860d8dc8c4c469a8fd3bc5b83f343c0a1cd8dc5f825c9e50eb0918fa44062a8c -size 839892 +oid sha256:8959039f2a90f10b7c84668f4a568a4c7fe66b94c586973b652684a5deb37e1c +size 824302 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 42ae3d4be7..d6905bcb20 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f8f36bcdb54d4ab336268528f8c7fec6748c95b6a27aa0204abc7a319b98de97 -size 724592 +oid sha256:3f5198be7d32b3fa401edd5a186bb6f73a527d05c4f0bd55587215d79d62bf7f +size 707622 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 1cdcfdd01f..f61d342b46 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef3908d9a46a759dd017e8d0c2ded67dc27759da664ca169047837e047aa12b7 -size 761798 +oid sha256:a692fd5f3267d8c05d74759b9cc79d50cc3c759d8e98207f31801793aecedfb4 +size 750352 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 47d4e400b3..6df4aa6831 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9089af0df4a83a333848f2b390b78e60ff707735a67648cefbaf2e09f4899e0f -size 661938 +oid sha256:80876d74e58103606b734fa02730c7dd30c211b9de20017f74493665eb51445c +size 650690 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 1490c92082..6a53b91f00 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:106407c51ab474198799b25fbc814729883cef81f5f3a7d0b45dadf2566f763f -size 692680 +oid sha256:02f5b876bcd88070a0e01035d5359869e5f98cda5c89644a9318da1a6780b786 +size 682516 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 941cb2c5ec..65e5c9b1e7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36d31f33d8e4c5afcea306d4f26fe592408895fafd7a25558a51ad1efe33bba7 -size 588281 +oid sha256:9bdae9c140d23d20e92c800c26dcc749570f471ca57483462242a3e4b0f9a50c +size 582311 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index a7ca63ea6c..ae263ecaf4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4960b04420d4a1ce44a75b60b1c82bcf745368568a6ebbda63e9ff66f4a6ce82 -size 952386 +oid sha256:6dc34d92ad7f23b3a6fe48a65a0a939234d24f1e140d7c341ef25aebdde246a5 +size 919876 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0865f42c79 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:585620bc9d65ad76fff0d45c038ddee321007acddfa7c20ac94ab0f7928075dd +size 905778 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 6ce6c5db80..8a9089f737 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:833e2726563ed8d850614219c56d5d8188dd4c5c219097d9b076e44f8ab13e4b -size 855882 +oid sha256:5640de3227efcc339a4828768860d7a3de10efb5790823779002f47bd7d75733 +size 831512 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..10e293fc0f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d39ecb28177a4be15abc033a7dfe2bbc60b043b159e868bfb39be22cc1c54080 +size 815292 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index e5e87f80f3..fb6a3982b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:164a0e00b16c1ee927d0f7a166495a86b80dbcada6335abd1081f976f7ae871b -size 627092 +oid sha256:45debd65ef0fdbf8c9749e8c2c74b7059edbca9e1bd59ccd906f8bcb43b53f6b +size 648404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 82f7309d3f..9a73d2efd6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e7a4db74b5fa63ecac1227c4bbf9d90f499377d1201b9d1f11cf9715c84403d -size 
524025 +oid sha256:63a7abc1e2d52c3649b94b1595c5f5c4c864ed6e27029afc70d270d18b1c6a7e +size 544401 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4031c6f062 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f197409630118b8a0025b5f975ec5925f19f23d871a3d92d812e2ddc632015d +size 727338 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..db5d24c772 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b9715f5df8c159d4aa47db0d862617aa485143e333edc95a0cf19eebbf5e457 +size 616279 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new 
file mode 100644 index 0000000000..e005c9f6a5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:765053d3195d0fb31a82e98e2c11c3f736364b7018744913f71cb8103ee0daa3 +size 773710 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..713b390505 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5ccf2fa6553c5f0b1341895144c80bab89aa12e2215eb4c3a713d30424b68ea +size 674838 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index ecd06c8283..422c75cd07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:c8745d57deb568a34685239277c01f1b8d5ddb291acfb5fdeb89585319c3e558 -size 592655 +oid sha256:741a69289a1e0f2e2117352151597da35d6ece03d68b95674a16871f74d009c2 +size 603557 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3e03142a29..7edae1f2ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7847a98b2dc0ada6f2b3c3ac1d6666b26a3f42e4d51c99e9d0969f7221b7652e -size 493831 +oid sha256:749a4788dbefa022822eb4b0dec5390b81f112bbde5377356efd5acb1f70a411 +size 503995 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index b83d674867..10f1dff243 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db30085e6ad87df6d722d62c929a510002e87a792b5668af12284a7c0df5a32d -size 695302 +oid 
sha256:e9401fad8e04fba9f23d9a9a83e04b7a55a5d9be8274c18aea4063d6cde44235 +size 681636 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index d118eac911..24625ea2bb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e35a5a508b188d18388ea711f1d22709f5411f7a99284116d256e80144a04a15 -size 593863 +oid sha256:6abb33a166315902e29cef351308ade82433d31292f700783947cea524ba0fd5 +size 586857 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index a0d9450105..3fd63673be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba74c1846d2aafa7ce80d19c0dcb3573d152566ea1cfc53236db9c3cae37cbf5 -size 716884 +oid sha256:b3b442f3fc2873204f190e551ce64f820a33fe128e7972ffbb214d0ceff994e9 +size 703120 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index cf09d82f1f..dbdcc53231 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70fcc001595ee1a47c76818e0afb218bd6612968d36dd2f1172e356f20952b23 -size 613521 +oid sha256:3290f950401c1211ae314f46ac1f153ffb5d2237096a4f77ff55b18808013287 +size 607157 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 358329e36f..06870222e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b77f040a660dabdeec21bd454c49248a8f218bed269bd631e80cadc5619406e -size 691204 +oid sha256:d7ee0f7a0adbac6eb9695ae8ccc51c4c4a6a1734861b186a29a3b93ca87c0dda +size 682324 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 97c0109f42..fea05b91c4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e4437a357b0f70c473ae40039bf095ef4f60c3a36aebfea7fdb23f76fbe50de0 -size 609647 +oid sha256:28a7a5075987bc487869fd87882624f8b696ac3579f592ea574b5aa3f2a64cc4 +size 602001 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 86295c6335..3a05394e12 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f97c896284538f56db207acf127439658839fe596050448bfce72f299a113dc -size 710468 +oid sha256:df3a57b1b017b9b1552199acc15eed40792ebc2245ff884b06830e9e1947b092 +size 701342 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index da409498be..3572d7692e 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:168fd273b6f98e52c0f73c855e24ce4afd98eb9c127abe837bfc6a47baf86167 -size 629554 +oid sha256:4fd104a362e8667fa2b1e6341c43b867095416bcfb36c1aa810cce592d51ee21 +size 620872 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 8fd5bcd3b4..bb93969158 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa0d4860e94bda6d54623041b79e97639dad99918be4f594aec95c0be02b0005 -size 760798 +oid sha256:88c0218630590d0e63e270c64a637ae5f01ed77460314d7aa50b6e809c79c961 +size 751426 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 43c7860725..176496ad0c 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8eeb1ca5f6de263769ce82fd73c58426d2a069f50d3b21c92dd4371134ffdf17 -size 664690 +oid sha256:870286e5dc002d7a67fd9335efc3fe3fc388900b59c672ed014185aadae69729 +size 656006 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index d0d11cbfa3..6b2d778359 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e3e7de0df52ae4e2b6186cfb38ca389da7de2bc3aa93af5202cc2e47ea07807b -size 781690 +oid sha256:b6f89e2b0c034c1ceaf06a1cf65306723b055ccc6371fe09a2dbb9c5e5585463 +size 773600 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 44f4057a91..720b8de8e4 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6aa527b75efb10ffd94d8be215637e640d6a3540f772924ccce69487cfa94086 -size 684200 +oid sha256:8c560c56ab1cfd06280e8c7cb29a70820a3b7e9a5430f60f0474c68496565ec5 +size 677046 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index a7c22b3121..13eb42332d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8e4d85788dbacee84f68575af5f9f01629a5abf8a9f4350ec0427f4a9d68313 -size 851130 +oid sha256:ece181e7597e0a7972e36f5a5d7121345392104959123c4a7d674701a4312c41 +size 842250 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 227df3f701..787b6ee457 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:75e26d81adf7b41a034126812c36a7bf6a3a6ae7d3dded2dea300b423b9160a3 -size 846146 +oid sha256:427081069d1386f47ae7df317dcd6179f09cee11cc4ac5aed0bd2bea5270faa1 +size 836378 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 465c59f421..ee4d57eb72 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:da33c97ee9cda09d3d9e248f5ef37f900917ed6d4b842850e4b19b49717de7ee -size 931640 +oid sha256:f056c8e52d2f039f440a2b10cd11d26448858b4f0f15a4675ae9d0eb0aa80478 +size 919898 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 71bb35dd0c..d5dd55a892 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c4f1691a9bdd7ede16cb074fe034945037cae314465d078ac8858a2b71317a5f -size 780730 +oid sha256:f164a2c98fd6dad579d09a5250d36c67a9d7b21c74fd7eb3458935434f888bc0 +size 770962 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 70bac35787..00c7154615 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f96ef76f97c89cfd9234fcd225b21a9c9dd0177c302df7ab104cc455733e438 -size 802434 +oid sha256:9c69ff2fa37e47727ec993381b4b99fac4d2bbfd4afc5e408f91c5c0e3c0dff5 +size 791680 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..220914617c --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6271a555a4c73f49527753c222c90b9c2ecd5b98d8b8fccdafc367c86d000c49 +size 884996 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3f97547e37 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2051eaf0fe0114811458f27bf6f379fac4882ab41b268bd237c56eed907ed1d +size 661070 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e31ea2b988 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7877570b201298bf4f2532e289bb722de24eae41eaa5cd9970e7a6b8bbabf73 +size 729694 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..495169502c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3628305ab328f806daab8a0e29e2b65da35faabe359f4aaea281210f94286e65 +size 793234 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..43cbcba8fa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05aafbe8d0adae7d610f7f22f5804e3c6ed8848282200f9011f33aef2487487a +size 623624 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 3f0bec5bea..84530da5d1 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2e970705a1f0ae969161791436948689bfcd80ea57eb0a6172e1c2a6e22d0c4 -size 769230 +oid sha256:f8665cccc026bdfa87af59c6771064c601dae71bb146078e819c3cf5a74b5a94 +size 753592 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 32ff4de2aa..ebed2d9a3c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e031cd52ba3a05dc1448c0b602e5d50dd1a314d3fa31273dd7a0cb398f569a7 -size 828528 +oid sha256:d6d852a171bc642b623faffec231b7fa404cb37bc937480664e0584ac88fdef8 +size 816934 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 01040f84a5..0d02cccc3c 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f212097445b94ba31ee95e05dfe96f653276d63d8b7d7a28b641d2664044c243 -size 900900 +oid sha256:84a5e171287a8e25ca7a3d48953827f3f675b3244cbbbc1c8c13573f81ad94a9 +size 886494 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 6484df6926..b5c92a16e3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c1e7305f27e0b180fba532bb3b3a29c936ee5f7650a646cb088bee314b6d4b10 -size 714666 +oid sha256:5cfff143510c4b49d6b990beecb502af2bb1001fba0bbcc68706849f38b5e97a +size 699916 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ad3eae135d..f211106b4f 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5bc2a160db75a37062257db3c0bc954e9b1b2a744e9e27006daecee05cb68286 -size 792166 +oid sha256:7692fb572be66f033ba18c57a3ff34bd24203b184026002cf800f14ab386b68a +size 778502 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f8dfe61686 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b74bed0feffd319d266ce9ff6ccd04276d94dbc242546fe83052e3b277df7249 +size 798756 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..61779a2d49 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:2a5e2534b965dfb83c08ccd31e5be465727388f25565cda5c58fdee692e3e498 +size 641824 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f98a071f06 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dec74c3d8ecf9a52d38fc2b37e7e222b35412a3bb23e45151f44738db7e09ee +size 699496 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f8ba88d6d3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a16e88dac49c8363e51d9e9c40799306c8e086f3948417e943ee7d7860fac070 +size 721596 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..54d0062efa --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16e69c27d60d422f3f3861a39428ba2433d11b64a04c2d530c4d68c402fb5fde +size 609805 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index bbf17241f4..f43ebbf46f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e14929df707211caabad30e9db58e9fbb62a12cf7e430c7cf3beebea465108be -size 802948 +oid sha256:c352b4d84c0a25072d687fdffc241b96f218d2b7c2f0c018890f6df106896997 +size 791552 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 1316fce347..39a13db9b0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:b71954f682c7fc38e69ed3b65b7f33103ec873068413c31ca614c20e44d3b9f5 -size 769858 +oid sha256:d2a199937125b6c8acb3d40d88b2f0bb8760b9ba20a3fdfd3e5c978d0b026e3b +size 760336 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 86328ad506..4d889efd59 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:faddf8a1e2a53506efa0571e9781657a6651071fff0b2436e68758a5925f6001 -size 702350 +oid sha256:34553bfc5cd726e25463ae897d763eeeaf0b6fcbcf704f8e21396ed90c321751 +size 694752 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 75ed8bd49d..b9c9bc1e96 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:730fe780ce2599b5b7ccea2033cf0d389e2f6abc08eb59403c233c716268ea1b -size 669358 +oid sha256:fd60df061fc02fb40056d30eaff106f9716e704e86b9cefae939e52b862c3128 +size 661760 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 653b995d71..f23779a6e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd8cc69a29abd91f6c4fd3a675246937b012fdea9393b7860eee8454b0bf027d -size 835568 +oid sha256:1d24fad6b5aaaf03b28abebaf79b7d1b20ad2c707834a17cf836f053a2dcb9a8 +size 824024 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d633d2013b..9952282f91 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1752d965fc7e4c40a68f1dd53c3662f84fbee761abda753a13ce492c41f3888a -size 725350 +oid sha256:7c1cc140f6de0591a027851172a0c799678ce81a3b035f2999181e0d80f7e73c +size 716518 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 337af92d3f..995fd29c0f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:604311f30afdaf68ac49472b5867ba9e9f48847ac848057a2e8ac0ff1c3b85ed -size 925454 +oid sha256:ad7b16822c786bf4e3637e0406975e87e8ca1be7b9b1d968e140994edae5f0bf +size 919188 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 9b7a3ab250..5bca4ff9cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b8e1e6ac861e24f5aee0e2c6aad18e8f9f08457865fdf5fa4ea4a41b8d74859 -size 822388 +oid sha256:a732eb246898bc94098649a6359196473ada83814076b38f1ea8c9ae876e237a +size 812866 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index ab8649e884..eeb060a599 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e29db872b6af92a50e7e679fec588da6c3ba03fd458d302ac57e62ed747781cf -size 716922 +oid sha256:4bae1b3f1be2a808eb941cc066a0ae9845e45fac3ead907059845da7b0105b8b +size 706710 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index f86d25d2a3..346d0499a6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6ae3a20cb1873b7ed3660d6ef4a0feaa446853b00129a52ef96d4896070c9f3d -size 613559 +oid sha256:3b8e06ca1403f68f85b0fb5298f92f53ba0ab86b5ea92a435fe7755cd2fe509c +size 606159 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 844e411806..bb3df90ca9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26405d33759e4cdabf26daa0411034657889872caefce11dd879b1e48360a35e -size 766500 +oid sha256:80b3a3f6d6ea366cc44d2c1aa70084224977ad7d5bf863bf52dbeb5c3d7a9a95 +size 772074 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4137382fb9..c549f7a24c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:443f9b31b3cf08c6961f89ed9da22e3e31933f4a02364249d63abb316cfc55a3 -size 676112 +oid sha256:754306e77fab919ac9efdd6ada2d10e86c5dd97a04abf020e516ca3946705f81 +size 669896 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 51e49ed441..5b99552f6e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:7f1e339b61ff546dfaf883f14d970b2085fb82df034332843197643baa9c43d4 -size 822114 +oid sha256:e1695921439180d01317a5ef53977fac47fd365be380f551111e85eb86e0c50a +size 812444 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d863391935 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f6833a4e262c40e8f3db8536bbde84fc3069d409e8f3826c7321ef81af7f845 +size 804760 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index a9daec840e..f3d2ccf4ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a687de0580b0933d0b03dec9ed955090fe93627367bbb72aeeb60e1068cf125e -size 725560 +oid sha256:f181af7686e73f8b95c04b1ea8ea2f44430582bfa9222aab0d7997f9451db4ca +size 717766 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ffec4f87e1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e35dcca5bcbd0cebd1a3d48e2416a919cd4cab80eedb709f51acac4474434a87 +size 706284 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d579e36d0d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa04da1a0dba4b771e33398197385b55ed632aa967cecf288d0869ab92a1a4bc +size 677578 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c26835a03c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:9ec78868d03d7d6e68f5837b885b84c78cd51b1eef080c652d888c557088f87f +size 575793 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a9642603e2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:474db6717df0de7ddd8a9be77c4ab5bd8e38c5b166c88be3403bdff717daf4b1 +size 753156 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..48f736d5bc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c9531aa6131f8027daef42f42c15f6d8311c03ea454b5ea6cef18ab48a88be8 +size 650634 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..56a24920c1 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33af2f0a6c36fe587c1ff172bc42bc23d150d2822dcba0759e04e464cf8f4a40 +size 732042 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..64edec1f73 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe0c338ba4d0c115d95b38b7eadd05e08f4cc17fbd13704f555bdfd7efa7e149 +size 628384 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8fd7230cca --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:089fd5e263c8ed20ac906a81132de93e9f9cf37538d2cba2612bc31b5b48e855 +size 624838 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2849122330 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ab211e87172b1c699ed8577cfbefbf881b7bc9097bb178ffe40db124fba68dc +size 534895 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 2a70ff3545..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eaa35d16647bf93992fa0cf7c53281e564760db82c76ac5e75aba1aaf34b7c34 -size 661858 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 66f998a22e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:a78fa402ecebf656736955a04b4accf32812f453f0783b8b1ea6590572c2b293 -size 625498 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 5bb7064a5f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a576220b5e64ed2af57e874863b6bc810b6ec1b15d9f2f48183658b764c179b3 -size 644338 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 1cebc3e11e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7c2d6bcfa02b12b73e08e83eff91890010be545f5af1c77fc4412feb4657289d -size 613601 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 1bd82d143d..31e1b8e975 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db9e359f2b2ee7c55551c03545ca08d04a14812788ed4c14b9c7b9cfe954720f -size 802996 +oid sha256:1c5f119bb9856ef9f7c548159d1ee6312f21d51d2906dfcc7728d53667938adb +size 791402 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 4eed5b3583..f6d41294cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1f5675e138c425313da6f12e58eb4c28945c8253fd64df57adfdb974f4b316a8 -size 720454 +oid sha256:56532536f17be0a16a16e1fc30b2eab1504051ba698895741a8f40fbdfed6f64 +size 709846 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 17f9668de6..56c8088027 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4347471f93c82abc4e973dc0b8c778f935c158ae2ef0931fa0b47668fa3fb9a1 -size 825960 +oid sha256:575f863078bbbb69066407d5fb27f23ea9d1094dd52f6b6aebcdd619c1654fa1 +size 811802 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index ee651aad65..941d447e05 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ac7f61779c1c78050c3a6ed0b3bf72ec2626d8ccce7e7b87812deca39e5602b -size 739570 +oid sha256:f336ac9c7ae7a8fcd26685ea7e243fc71609a96ec2ebe073114e5c877282c495 +size 728666 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index 9220620fcd..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:bd483dc09f2b2fc68c20af45a9fc7f854c4c1db992a01cc5e399cdd5a557842d -size 662628 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index fc0ffffc3d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f6f5cf6e86fa9a9f3a3fcbfb834d4e076964dd9b06f4807cb2a50d16d6c6dc0d -size 564051 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index 6c02e1d859..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:471d78de818b804a92e25b77b30142dcb73dd0c1c667ec868c0318077ffb39de -size 626266 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index c25b413027..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:75e2ea93b5faf4ef392b5a193c679781dac29320e9fb95c984b1958984bc3a74 -size 531833 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 2bc34a14ad..0151077b92 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e354e1b86a31f1948727f663f7ea1c8500b2fa0c58448bf02a76e0a414c5389 -size 902912 +oid sha256:57e0de8a6a26278f2c8ab2596dcd499e9069aa3496f457c0878d0aaa3cb47c50 +size 892700 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index dbc11b2bef..b40c6843c7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f94112e9fd1cbf5b0f39f0da808a23587b03cb58e25d5f0460a27c4c5b5f1c85 -size 874742 +oid sha256:9e1e85cba094f65601a81d0d050e841e06e954761f936cc845bf4aace50e38b6 +size 866602 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 2f68fa038b..5c740c1e86 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4b197443637164a82bdd02c1eb1166cc233db2df06828558553fc8e18153710f -size 963640 +oid sha256:0743a3b2b22f77feb16c0138dbd353e52d14d022fc97a081bf1f8499de2286fe +size 951948 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 6ce4bc9726..bf19ffbcb9 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db98018b8c5719da40351dfd4bfa8d3d8fcf6136f8253d67de932a9598cbe1c1 -size 818798 +oid sha256:2f9ec99cb07aa6cca69d4e07f4036a06a73ff0515271d483654821a161c3d971 +size 807550 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2627510676..900051db6c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a77bff70726823153debba1d1e9720a050d1b263d50a63037d1812ee35c80e9e -size 827872 +oid sha256:510fc7a613e07958de42b5b34db79bd7c7c5357d1db3c1e15703d1f2156f3cfb +size 820522 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new 
file mode 100644 index 0000000000..4056073703 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8513f548b28ad4bc42d14cb134c592b7e970a4d9fd3114ca5ce10017136dedbd +size 917244 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1834a0f57e..69ccdd1246 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f3b11e4b36970330e1d42dfe2eb848f8bf49dc65d53fbe570a7d761a37a7c5cc -size 688138 +oid sha256:e223cebc35c488d290092c0af1d97c67a97391857fdf24dc7f6a060efd2718c4 +size 689716 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4d13d5ed81 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3a8c92aef0fc712fe84b5db3744beca848852d4ab51c9bf3358821e8dc1a061 +size 761546 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..95d3d35c87 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77e0529fda6d6899fb852766837bee152d23968166f400616ce0202926c70dcd +size 828984 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 18e35ab3e6..f07834140c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:01fcfc04552677a6d26e04c9f0760213414a749bcba1a31e29e832498f58cffa -size 650938 +oid sha256:625979e2a3ba00814200e8bedf554df7d7dd75da2743c6c37c0094ef1f5179fb +size 650050 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index e0ccac0784..d1bb3ea34e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cbfa7b0f32444e6e156260d1fd2c2b11c8b41679df9ef21e6605b7e95aed62a7 -size 820470 +oid sha256:9891882369878887c9bad4838865e3d9722b4c715c840cde8078c95fd0c5bfd3 +size 804832 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 66bdbddb10..99b73e5bb7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:4c3334ffd4534f2e797afcdd57dafba90b1e4824505c2d0c8df5ceab1e9cbc13 -size 857172 +oid sha256:88af9b1543760ac64e53d66096d4aec47c4798811c14ebd0b2d06626860b51ad +size 846368 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 6bfef997ee..a5e6746edd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab0e2f03d99a10395df3e370b23808bb7de77bab343dc9838a59d8fdbde402e8 -size 932702 +oid sha256:2eabea4b5b79c84bd5bebb55a3cb5c9c1fbd3f300c58637a64388fffac0a14fb +size 918346 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index c0313c6445..e8c812503b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:93a3b1143d183855c746d00441d28cbf4d8f56482520a22b7204851c92cf8f21 -size 750908 +oid sha256:bd432cd618a8750d710b1d7e82eeb6b7e1589a7d7478f252e0436daaecc4746f +size 735912 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6fd3abf119..6c45f7ddbc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9442950f9cb0a78860d8f3a230a08b1c8ac7930075686b6ac28f4873948b9335 -size 817556 +oid sha256:30891cae1b4a3ad02352dbe0f5d3deb407271e61f3c5f4efafa3bd7220e10c2a +size 807344 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..acde11b4ec --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24f234dad2f3119315cd8ffd31fdded469c243dad195bd8635096e93e85eeb91 +size 830164 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6618526fc3..20ea7177ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:13a8b1fddf05632c0d055d278f335cf500069b4823907f7d8d371f6d01dea471 -size 669828 +oid sha256:62d29433a5c0179013c5aacd1bc8b316b3e01941299d083382ef772f291452ce +size 670420 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b695401209 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f95a685c9a969e8be55a42365a7bc4e8c3049c78b97a40d084e5a3400030e3ce +size 731348 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1787636a0e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:460e5513482aa557656da08cee04dee0e758a0f82be5a37128a9f4301765a239 +size 758234 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 91d4202127..4ab222d1d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa19fc1e7326ceaf8a252f962734f54a9a716ae71a0c1decbabcdc3c2e6571dc -size 638450 +oid sha256:7ab852f1f450c04251d845722315cc01870bfa790b64a994f42da07c3483f894 +size 636280 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 5bc71145be..b2f9f29d96 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b988c5e7af56e07f7b43aab4edaf9985ccbac0905a67df8796bfb6dc95a906ef -size 876784 +oid sha256:6b49c941ebe3464d53f00b314db0aa68ed5e40554f7f8bc9ece3069c891229c4 +size 861884 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index f29622c0ed..41978491cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a44b2b886b04a73b607a733d8cb03ef7c8e292577c538d6c5ce5b9efa476b32 -size 817200 +oid sha256:c149e370d9296158d97b5d8907ad16867208eb035d9ae88a7dcd2b08b0d22ede +size 807432 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 5890538b55..35433ccf51 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1c40bccfb0b0eb754c77a272ee92450dc3a95996cc3c4b5b5fdcb337fbb803ec -size 773126 +oid sha256:78ff8f4a6deea595f04d1721ade0874a013893581c5512dab3573dbf7dbff2cc +size 765972 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 47d7dced87..3d82e2fa94 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17985ee8a7e9a98be0dbc202afdb731517b384971bdafe623dbad83e3c3a78d2 -size 720302 +oid sha256:d4f1d9f50cdd84bc47e4d204f67428c9f23855622e2d41db015a630bdbc44f68 +size 711916 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 16d86b84e9..0c5af34050 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec87f52151582f258b03331b4296817c85cb334f6601fd043505f9b0dee9e0eb -size 864806 +oid sha256:eabeb549808aac9f0597d0006bbe167ebbf849a21ba54e27f6613bbde6d48cda +size 851190 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7081d81584..cb7286ebf4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db93b810d8394eaf658b4fbcd66104e794db27ee5edd3f6b6a9a3b2bdca2e0e8 -size 752070 +oid sha256:dc8605446332a38a8dd76f684543b4cfeee745fa9b83d6b0f3bbbc9375e99cba +size 743142 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index ee3cd44d5c..2887bcc398 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f2dc4c016b7b7c77fd2307f96f485ac0d005fbc544d4f0c3744f87920b22ed54 -size 961696 +oid sha256:55a3381cc0913c46afa1a7dbc9faa103a56a7fc60308c13a77724bc6a819011f +size 948376 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 20e55ffed9..c77cc64264 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9e0d2e3f325baf6ee1b665ce12eb6ac43386bba22fcf82895549fe1e91dba25d -size 856164 +oid sha256:8eb2874721d053b528906291c10b1e45d88b54eb7bf7bbaa7d7c336d18297ed4 +size 845014 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 169f732048..0006b744ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d5a9abf08578268cea033cedb593b8352ea21b7a38d2ca2a3a477ef7cf991a4 -size 751832 +oid sha256:eb860c78049868660cfb696652b2218f1a47ccc6ac5049f633541fb5a434657a +size 740140 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index b340cc2b33..1dccd59275 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bfe130f9e3de31c055f7098a9922ff54695bec113732f3004ccc5b39fffa2f5b -size 649310 +oid sha256:a346af985e6ecec25593a26349ec98090c4308371ea3026b5fb6cb889ad6150a +size 641514 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e0594b34d1..b16843c201 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b52deacbf54a359b1a304c5642e46ca94c1ced4135680b4c6259719977c1df1d -size 807774 +oid sha256:b2eb64ac3cc10bbd69ee1ba6d75f84769e634f270ca22ae579345517093a0d67 +size 798006 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 361fb4a6e4..3f7beedf54 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b7e19edbf416eea951f03ffa0b0236bcc04fc8789cc0ad0c1389b363482ab2d1 -size 700318 +oid sha256:fbdbf3f11c9d83590a19a0ec5c1244e11925e1bdaeb630f92b70168b29363339 +size 694348 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index e2d0719336..18c6dbc6c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:072c655f877dbf2d2438ea6f9f68156f21796d5dc690269b373dc04f5b078ddc -size 896640 +oid sha256:1c7f1c3cdd1ad8a5f48a5cece7210be84d4fa1f04624167ac910977c799c283b +size 883714 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..cb53dd73d8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89a9dc1a3945edbc4a8f9220e0a7fe26e937ef36dda4046bd9138e87b8a819b7 +size 833060 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index f737ca37f2..f0a05e6af5 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c9110b2a4e15170d44c21492fba62d791c98e56740d2ce9adf726f9e8a9e702 -size 793672 +oid sha256:648b202221aacc574c89af71547d7dbe1dc5e41d931a453fe2ce63ad63beffe9 +size 785532 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d30077710a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccb9979a37e039a1c96caff2a97919cb200b4d951cae9b69b25b83eedd51e9db +size 737494 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4b1418fcca..39f860acbe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:042b9002ec7e8594a8f1cf43ffe870c5608c2add1a9ddeb3950552a43f519d7c -size 686242 +oid sha256:e89ad27693936f377b5a66784f706095c14a83fb9575fb5307a9a616e218230a +size 707062 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 95b8dd1905..8c2d992364 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40beff972cb6b88cf0f3c7af4c1f15422d0a73ee00d190ab462328fb3279cede -size 585593 +oid sha256:3a5db054b9395e87ee16405d2bdbbbc0492fa918069a3796e80cf7e3ae7f848c +size 604489 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..574fc1f2a2 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ba6d4608b845d1e367cfdc2ab184b940e477d30e41f0de38513d136708c46ee +size 787228 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b2dd2ea646 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7aadf8bada0b6799fa7ff24b6d23b1e3b8584b6b343fa61bed6538e53d5b8005 +size 684162 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6b191df682 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fea525aeee5c2fba07028131074d092e22038dd59f9beb8a01568f5593ead670 +size 763894 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d0a22dc69a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49c669b24c3a3f06fa0fc49276041d9a79fddb19cbe379a3762e80bfbed4181b +size 663738 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 6108e7cfbc..49c2cd8ae6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9837ed51f2cbe13252fc7ae9bbe79bb903c53031bedcdb8a84daee0e1087e321 -size 648550 +oid sha256:f0ad42036d877996b854ed1b5cca572bab812dcaa76bf02363a61cc002bc9d62 +size 659600 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 50b2337a33..f4739f54c2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:309b570ddaa1cafbf995f4c7fb71676cb7e78d946613e4970c301d793307b8bf -size 551799 +oid sha256:3960eb80f3c2950497cb0218e5a77bc92688ef4b88facc2b9dfbab3dbd4d29bc +size 561863 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 5d37c0f157..acb6d8b596 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d938eb83b47d9bd8b5c518eb84663118c6e445013cf96cd985e816ce6cbe8b42 -size 689330 +oid sha256:4224f381268817375fd6d897e4d80b573000bb1cdf89b7389f2a1a6673a342ac +size 680056 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 
9897dff76f..b8262c97b7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:20dd33da3c5376a4fec5433a7c423b211f06d693ff0cc68555f25dee0375b82a -size 606539 +oid sha256:c447af9ff55108f9173e733b4d4ecc2302b8f0c1993e7b6e43124eaa1115f183 +size 599633 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index ecd95af83f..9d25b57c90 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:571c187df15303571db41052a007ab709667d197aa9c108bc624a9ab76f94142 -size 735234 +oid sha256:9e5b2d06d3317efbef6e5b86a296f360889ae309ec289c422e4cd31b974c26c1 +size 724676 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index a1530a6af3..f57eb96c44 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3271fcd12bac8d4ad8f109b78eb96a88394e574b4415cd5352e4b164dadcff3d -size 650768 +oid sha256:a666678e1e5ff850a56b6cb36bc0afcd6fb646efae100f91e01d2484c952e5b1 +size 643416 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 70e4f65467..def4a8aa86 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f03b508b5639fd6d65b77537020a5bc935177a01ade70c1336cea0ea8a7a3bbf -size 690808 +oid sha256:339b31e343941aff4ff7aff4594524b12b94493767c56419c8372462fe14addb +size 676206 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 675b3f1eea..e056c2e767 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:0ed1c9a09651d85273e0a4a8f5f2c3522108c60dd4b50cc1b795e0e6585de01b -size 605453 +oid sha256:1869719882d818e09df29dc38e148484fe0b3d710acfbf9166c37026a135a147 +size 598545 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 39f8a66b3a..599ee94c52 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:838a6651d9543b19338e9f1c5a1b7be1a063f039300ec8c315db32193707849c -size 733406 +oid sha256:feea056cf55a614d752365144e6ada60bd62fc0506a2c997bba2194cc39b3709 +size 723934 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 245eed24c3..86902dc360 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:75cee4816978e54c8025dccde03270475cf57616c846db6c29cffc2482f97a3a -size 649532 +oid sha256:0c5abc18814578de211b084ce59738e4a3fa8fa09e455329e4e12658a2be4ae2 +size 642626 diff 
--git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 3e2c06e8c2..aced2708a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:236de1e9f495cc79d2ab3a5998a044724201684833cb9a5f2fb315a616a1e080 -size 758084 +oid sha256:0bc49b1ae8e2d8b9d815097d5da3c22fb8bf6384cf7d7303381f6d27835fcbb4 +size 749006 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index d01180cf40..5c8e75edfe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0316545094971320895ab2bf7d8a35a9a2b1344a03b0d52053a4c0742a26e334 -size 676972 +oid sha256:891c0431ac98f2de5420f45f4a028d0fdb0b834aa7648a702b250ff36d5a93f5 +size 670212 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 3aaab8a084..f157a8e63c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e24d5fc5dc9c714de2a1827920efa0cf02ded1dcdd88bd74597e8c8a25cbd199 -size 804184 +oid sha256:9fb9d2913e7170891e994913832b5897ff8a1099cd0171fe050397865ff1789f +size 793628 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 0bb9911815..7190970834 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98c4e7bea21017e4dbd1a6f021993d48a3fb5663548fea8f2d37ab79e403bf5b -size 722136 +oid sha256:f1fd21d299bf99be1ce3ea7e7940108fa0b8afe2ab1d0ed206f8d43e9d457971 +size 714144 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 65d1193444..f357061cb4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:229464817a87f2724511970caa7015d2e2b9557401157538c36983ab5d53d362 -size 801104 +oid sha256:0a4c840acdbd327d6c39ebe6c25d701381e7fc7b33bb41c7fe7190aaf41f6fcf +size 785416 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 46496a7009..71e865f766 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:faaa18ae463d88d6fb34ad623b25426d514adcf4fc9694b438172a16c7423af8 -size 657492 +oid sha256:e3f4c4c3a832f0238c5af0dcb651c759e5f45c56b9194cea4133f92b862c4880 +size 646836 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index dad8094594..7065b719a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c65a0aaece6b4628c52439ff3eacdcb4a6fbf100cd866ca69ba02b1df6905a76 -size 723550 +oid sha256:951d6c8616398eae7f2c8d6e8eaeb064730446eb5cc6498dcbbaba623fb539df +size 710526 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 2e4fee8d6b..72a5520b79 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8809c3f5b2c6b19ae4ddc4d99874e75c2d448150dd441d2d331abb104173392f -size 712548 +oid sha256:93392de33fd11b8717b2a27157539e6c3b8a6d9a218112fff61699a146671ec8 +size 712400 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2c368826c7..a65f9af052 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8646a5ca386242195c092ac36a64b4ac01be3da52a1266b89b5f07c2615cc986 -size 631048 +oid sha256:4d3a864552cb42262502ab8bd6bc62c35a3d6f888f054d9d67d8cce65e2f55b5 +size 620984 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..874d07df24 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f19272c10dcb4e3b688872f9e8345a0b810b69c2ea3523ce791a38b698415078 +size 840496 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f974ac8b4f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fa8e49372de8785e90feffc8554a4e34cd1b3c63b243fcbeef8dbf38c24a170 +size 568123 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..48692c9593 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7bd363462b53fdca60ba18002dfc11df517024ca89d88ad139ead98cb91a322 +size 625894 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..53e9c49f4c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:5026743e4bc7cc08e705bb0c566d998b68207e7466f2f6923af2c52f2b8b742a +size 736352 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b5e501d5ab --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c24fcee6afc435a813c7af6564bac1ef670b3183815fd68c97a4835083dd51b7 +size 544343 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 30c96e2931..b8989c5da5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f247cb881eed8bd2f53ff01d0dad413dd542be9826accfc2871410445ec372fb -size 738148 +oid sha256:ab4ce95026393b4e049d546c2bee72e910be38344879eea51aaf29fb24d0cab7 +size 726950 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3da4465eaa..73607d5011 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:15b650efb988975d1d6438b96e1ba7a66142b1f1fc7b29ac9c728a71a4f96e2c -size 648508 +oid sha256:ac6dcbbb8654d5c6378f4c82eee845423d6d87f8adc1709d61e58c6f9e3f6e3d +size 639382 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 54c345d0f5..4c39ada440 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:687204b1d7712fee7b27bddfea6be8282eaa060f99ab77f45962efc1bc390d34 -size 710816 +oid sha256:f5d7720bb5a0712a39432e99159e57e6d90bbaa21fde1d644bf6267b601cb510 +size 696756 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 7f4eef76c1..60774dcf27 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6d3a60372fe44a8fbc363ce6a4c1fb94d7ba43db004a38dcafe4292a1ef4492 -size 671942 +oid sha256:d133a536f1ac5837a0e43231ffb79d89c77ec0fc4db48fe1f26346c2454d766e +size 659312 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 584bf51ad8..749f27b485 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:81a901b856310ff684865e89e7d5e3749031915662864f11a3221e2413eb859f -size 624332 +oid sha256:3d96811df8014453252ce7b42d4f87a847afc10989aacd74effe757a8d530142 +size 612689 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6a7c66182e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:861cec84c8c5d19a2e40b5d80b981809cd699bd142560ca986df0535087a1891 +size 782030 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e060ab13cc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd635a87a3c7b09eacce46c405988040147af15c6fe376e633b3e1e773927cab +size 559237 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..bc12003bfd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:647067c3cf1feb14c3e647fe0b6477c4451a937bb8d3488020c120ee8800fe78 +size 609755 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3c4755c53d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd4103c4c4047f571f127e0c4e48d2c8c320b9853d4f9681717ff3f5b8145dff +size 683262 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f6d88d9b4b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dafdb1620468d62e5d9a1fd38a29ae5a1cc27432814320be5623a967bdeba92 +size 536443 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 71b10e4165..8290353442 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a55495750df2239425242c91571f2b7ee6c832a474bd0f4387ed1ba2ec52a2d -size 778674 +oid sha256:9b362b6a23bdf19701e61808b922452a22307bc322c65fe311f442905c6b4e68 +size 759434 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index e09cfa7b4a..3a2bd1fa0d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7da005e17f9dfa4ab3ef88c2aed263ffa9987fded29bdf371b5b0bca5532411b -size 743956 +oid sha256:1b9f64812b5834c8b3110b3949c9caa8e3604f9e93dec29d6256dc778ea2b435 +size 732412 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 109165ceb4..26d3c6d94d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d791f8e2661c2a92c9c792c2be5b3421ce513f3768f7a83f9655550bf62c0489 -size 686364 +oid sha256:e52fa025e998883e08fe1baeb127f7c91be848a35d0c24cd31dbda811fc94c12 +size 678570 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index a3ca9c140d..4592a3940e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:172e0715f8d157caa633d3886053c4335cf71834ead1bbc6fff575aa61a85590 -size 654852 +oid sha256:75aa7d105a697ee40b950e10bbd7b732d9eccffcb132e4776802f616952c5649 +size 647058 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index e3ee142420..e696203614 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:2a1f701871be6b053313f3c651ad5d7495eef10fd010a10c07bd2bbbbca2369c -size 676516 +oid sha256:7a542a8df6c1ed8588352431c2e2bd369fd74c753c1f8123f1081c967fc97330 +size 664132 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d085d14c89..3d552f2c40 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a94a7e652a7461469212a2ba5a7edf236c88d47804e628590dee94400bd388a -size 580159 +oid sha256:7bbebf1e751634638d6fdc1bbbf6cc803d2916450270178b66c0dc93c8e2264e +size 572019 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index c4ee621817..86c60fb8c1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:320c46d115f463443d0fdee9d68f0367c433cadc000dd5f3242cb2afad524d64 -size 752192 +oid sha256:b3d475663bfe0a49a1ef1658b203a6c27b00cd22f73455a1a411265aeaa742f7 +size 737146 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 9e1cefa6e2..c9405eed19 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0cc420a55e91052c251cb8e14360122ff733ed0b0725741d4bb84ed346c4ab2d -size 657514 +oid sha256:0175da9807b3445b80918e1cdcce54e9a487b5824c0d19276de5b2d043339078 +size 640100 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 4a93c0b933..3cba774833 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5affdc8119c032e826c7a2a88013831fdf1ad5c48689e4c06fdbb3c945becfb -size 672124 +oid sha256:0be97c77ad5c72f8180971eb8fafc721917e53cbd8ba918326cec86c817206c8 +size 660976 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index f24f513072..8e3ac08702 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:47d9b7b2a521e3d45e9d9e089b41610f90a30893c593dcef7b9b93a3a9c8f3ef -size 584351 +oid sha256:f737d6bdb26a332dc173247887596417a93d7f8d0f3b17be2d97ce3fdc69cee5 +size 568319 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 5b8305edab..0445443fe7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bc83381585c11ead8dd3b412cc47bff3e6a733e71306cecd7529d104e9355d0d -size 631176 +oid sha256:30c3665e2f18914421190b67efdb24c8fb11decdc13af3095b0a120e1ea22067 +size 630436 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b70e6ad360..f2d7a4b03d 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:609228cb9f8bf91dad36519f6652e19030ba34959354f0ae0df52d93092add98 -size 544981 +oid sha256:3ecf7a26537bd1b263c4c4e8bc82e58cbf0ff1ea6457d6f286172f43a09f9d20 +size 538963 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index fd8c52fdf1..38c9ebf72b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8974207649635339301cd2ec5dd9734cd6ff62216689608021dd568f085fea7 -size 818462 +oid sha256:c54f67c3b343202885da90c7fd95349a78c55e8ab7a4ea4f6ca536a9bc9ea42e +size 804106 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6c6e9ea1ee --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:018952b36a366ee8f3db89cf678fcc55c46d36759e6aec2c321c8ca14a7e4b04 +size 780486 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index aaf14e7ad1..33e7e3b957 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1f8ca56599d3a602c5f8691669e21ef695301228252b4cc964b253a29fc50c93 -size 730542 +oid sha256:d70265a7399ad99350d5cda82895fab8516b323d334c1309f0aad716206d0064 +size 722302 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5606eb8e0a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5d9709f375969e5696d6278ccb376d39d79d39ebc553adfeec6b4cfdbf0c99f +size 703766 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..be06b391fe --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba5aa9b16848df928184d7bc7e4e1952d29a5c81d9f1cccda4ccefd47fd486df +size 587837 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..92e7b5fc67 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:260795542e9f001630e70b7231eba3d07fe7fd5fb573e13220adffececebf1e2 +size 497253 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e6829d489b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:e6486526cc7d6eedeb21b2dc3768c13ab8772e823c98fcef249512e2787a54a6 +size 661492 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6be3478194 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81e6502b91490f32d9212ad407afe7983c08b89d8060e0421336785ee51a1e3e +size 557983 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..26fbab0c27 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e34f391cb197815c906e473d10e14e77afffb93c692e8debc6c467b12d700fe4 +size 684334 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..01b6127859 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6970c6afa87f7592619b04acd5aa47bd09c4d17c7acd9e838f5396f69c821a5 +size 600063 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ded2f29c73 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfb9243907b81288e05370b2d58f8ca38f9391aea216f37252db48083db7f840 +size 559321 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..279e50573c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3941f2f5e30bcc3f3856b15c3dcc9cf808e25af7cc1a1147128c1d01601ea0d7 +size 467009 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index fafd2d5ab7..554ad3a141 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27dabf22d219dd5d386c77b1a883951a479a2293a3dd49b8ed95684a7be62b42 -size 796220 +oid sha256:ae7e4c947cc425ebf90493d1ace52eb160f829a60d63bd97b939177f89409151 +size 782160 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 6850410f6a..a30099ea62 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f131f5b0c96a9bbcbf61d2665b59410642ed68533276ea0fe88c6f9f43688c14 -size 838918 +oid sha256:e15f3440198e1d5aa33c49729de96614a92c10160dc8369373623638a68b44f2 +size 829594 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 76015620a1..16498d7b00 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d797429375984bf0c83f0722018c1323be76bc4674700d2bd5d16182dab6f7d -size 735188 +oid sha256:ec3e15bf3829434ed19bc07ea1aa8d9fea618757988c0fbd748d6926820d1a09 +size 725568 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 7d9e48ba92..ed219b7fa5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:441b23f00ce24d8c147f8cf4790a7a69d4f64e9623b616778c5545dcb7fd4677 -size 779218 +oid sha256:a38eb0ffac1b1731348851925932dd4072558cb3cbcfe72ed5280c48c7764f80 +size 773446 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index fcb189d5cb..c0c815b443 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd6d5f16fa8de1b011062f86301dc145a095865dfe3d46980972caece07fba1f -size 726690 +oid sha256:f46190be4d9bf8b3ab05304a56d17a6a58b7c314d20c48ea322d443aa547000f +size 717858 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 0e55d02bfc..762d4ee182 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c9852e393a14df42a38f39409c7d80a389beb54be155a19e4bbaa6986bb69d3 -size 642666 +oid sha256:d86a3f18fe61c45ad7167e928c6993c6cd162296d879b7eb078546ee94665ec1 +size 633786 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 5328ee662b..cbe7c3207b 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3a9e5190852081dde0e9271e7083648c999c02603ecdf76999ede36d39cad152 -size 773284 +oid sha256:028f44802034fb80a2de9182be88172083b33468c23b0b458ee6de1dcaf6fd0f +size 766082 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 4d20fae183..5a216c9db0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a6e3fa68efbcdbe260bbd308d6b8c50f4b11f511c9cfb4bf1c6bf87ab62b8577 -size 685956 +oid sha256:18303c48faf0752dde47f8b62dc67927da056c47c373213b0e1631fe714c4821 +size 680874 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 7783c337ee..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ea41141c2a8e1b47ebcbf69eb51b2c1cbdf3c1e318115f1a553023c868b9b0fe -size 566741 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 6625bf3ac7..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:10858836352603b23e0cc51acab0aa131b41bb6ebaf144b196fdae8a60973fba -size 544243 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index adb9561b85..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:045e3d27ec20752981dcf2a49ea1f683ce3cd01513cb8b69465f867461528494 -size 557607 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index aea5f5b47e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:91a99798780abd362cf078c002b03a051f06c89eec2a91fb468383a36e92b945 -size 536393 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 2bd6cd0960..5947e6853b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b2eb356f76d3822012050305e6b44cf0b5a89ea85ca31128d6ed8b7fd415cdb8 -size 769496 +oid sha256:dda25cea5e3ddd14980f890f354cfcf2a87d42b133317ea394343c42ba79781d +size 756966 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 1c5c5d83da..71180f2cb7 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc850e83b9cfb0c3051eeed845c69bd8ef83c7ae056d4a25bd92a2ad2e1f2929 -size 685672 +oid sha256:0847a72bac4b194157f3868da31267d4f40fb61d824d5de4b56ee58f89adf6fc +size 677384 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index e966ad247e..e02f3ebe88 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d998de74acc9514019d876ee11581a2ed6c1b72e84114a13594fc097a894e218 -size 815796 +oid sha256:86d022641ef27883effb8ae26e292456a45142c9261a7ca6d31062d3518534a7 +size 801636 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 1c8bf29872..fca34ff4ff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:188e72d72fb83ba058b04e9784912738e88aab941fa10f4aeefbb039bc4419d7 -size 730046 +oid sha256:4c0fb7e00d355f18c3caaa004a524c19da53e2157e3ac411ea9bc5ff326a415b +size 720920 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index cf884a96c3..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e003312edde635c20f5b881b1c302d4ab4e41532df61a425f13a394f7fc2295c -size 585517 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index a87a9644b4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4cdf1a3b739181ea0f4ede4bf83b4a078793b081f4a67d30d6862ea18b2e060e -size 484129 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp deleted file mode 100644 index 61fd4f403a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:027a83192e9c982ed8663b225944509d8ba3235fb85abbeb7fce5a36c52d41e5 -size 558579 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp deleted file mode 100644 index 7d8049ee5f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:76df039b8f3c99174f0ba214e1fb5e8f37863ccc334570769306b935bf4e5e17 -size 461187 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index a3323ec518..93a50512ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7fb060b7c2060388b6a6184f278831e8f16659fa17043a33f5ddf5c27be84e2f -size 851752 +oid sha256:cd60df403dd1258502d20cebbccc3e8c98647a9d8b361899dc64e4e5656c953c +size 836458 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index fc627e721d..1582038213 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:961a616ef174151339bde0d02d681885630e04a91d726c86632119aead62092a -size 687174 +oid sha256:4a386a45049f58fec9b54f50ed41a769286fae0aadc4b1aba79758e4ced32a38 +size 677060 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 76784d05dd..6a476f868d 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5821045a2bf4e0954c78df827a24ef9633b50134e5298c2d76a8ff1c079f8101 -size 757424 +oid sha256:4c601fe6fd522f258663b904260acd956f35d74f9f4effaaf859365f463f1daf +size 747508 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 2d3480f680..ee6c4f5bfe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b0c4b6c9e3836093c1ac1afb745685f98da1541b14cbb68be06731bfd17297c6 -size 748594 +oid sha256:7a9eb8d064de6e7370cc90c2a305c7942ba2b9742983fe532ee49d107e8f6b9d +size 752688 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 56cfd27d00..fe9fa534a3 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c99f23e710623c69c30bbedbc751f165020bb0402f703c7ba601ac3df68ecafb -size 657522 +oid sha256:27f6e4b6b89ba4ce7934af243d14f73aa4919577a4c38d37887e88ab20c9b309 +size 648988 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..71c50d6943 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27ef1d3cb563abf9a1aa341988087d9ef18b4941d2f6265741744c105af3fdbc +size 883842 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 89abe67f7c..5c1f76618e 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f7fc7088d27d49c83e8693fc0991764564c9d770d403cac2dda247b61a39aa14 -size 593019 +oid sha256:f4917a13c0d1eb722ee64cf57a6e20b4b782fac74431cbd80fa8311e447ebd59 +size 595979 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..575d6e7d09 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6511161ee35e6d24bebd12fddb4a94dae333915ca68e828e64a07a9374c92f3f +size 658486 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..483058a198 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0edab6d3323c44dbc74a274228d7189c630631ad316b2d27f86ff1585f60c307 +size 776590 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index c71d182b72..6b340f61bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5cbae8f637b4e31a4cff8eee14d15a502c1474c9b5a5f74b0539228413756c16 -size 569781 +oid sha256:768133178ea65ab8008f55574015dee1092f2ad949d7ed0955b1ef774f9b178f +size 570769 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index ec3369b08f..9df20ddb05 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22dfa6c634c6d07a3ab1f890dede49d05f9c435f1192bba3f88e0817d265fb4b -size 788796 +oid sha256:16e5708d21cf5624197cdd7e149b0c56d2e320d9b1e0d07d2e244b3a5b509006 +size 777204 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 41d8872351..7c6c3992fd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d6d29b8afdcea4c51f5c7b562f25e9d6b7016bd8f786d92aa1e7fd9aa11cbb9e -size 678140 +oid sha256:8c62d03d88b328ee5e51f471998f6d25643b5e29a1cade61a9295d0bac0aa5fe +size 668816 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index 4e3413557e..dfecf18c35 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2abf0fa60a424012224069598e66484116e4c9867290c86f6ae9eb4582907159 -size 744690 +oid sha256:b38246d2ad8b1c721d06aa0994a54965315c9faeacc56b1fcb48488799c44ce3 +size 731962 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 12abd0a3a8..5adc113c34 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e98ed870ad5066219727d9c0727e286a605b1525b7282990fdc8db791acf678c -size 711588 +oid sha256:753ac5684d59b260dcaf500fcc5f8d98dbecbec4979097f543d4d16fb31745f6 +size 700390 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d1eecdbcad..550712dcd4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e0f4ba4b4a0d20b655420e3b67b67619d873355cc96a644e63edb56e25c87bf2 -size 650808 +oid sha256:204e7041203995fdb0f0c8a31176d29198185b7b38d1712b9f466d2c9baa74c4 +size 641532 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5fcf7e75cf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:276f2d9c6054b26bb2da7b401521151c4b6aa5d990dedcc3e198694973f19896 +size 824538 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index e5f5407c36..59f3735e6c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d4786d82e02f09e4ede4d031854805818b4e2c56e1ecf19b9563b2280004f26 -size 583097 +oid sha256:7300b16026d11ffd887d5085cc7731eda1ae431383c5af1d36ad824ffe894daf +size 587093 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0398a8e2aa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c890cfc57c5014198dd90f2b0346a9bf3a32a1750850ffb8b7684584a0b6ff5 +size 641608 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..42e26e6e85 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5492fecbabd74799c5d49c02554fe84607327e23d889810d56aad7febdc5a001 +size 723502 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5864d0b164..0271260eef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc3ca5d0865e84ebb250aa8df6551c3daa232b6259d0aacfed20439a4433c0c9 -size 561043 +oid sha256:b5f854fdcd95f9293e6f0dc90cf4aed8de6f57615bcf796929e02d82cda77d05 +size 562129 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 69410c31e4..765a0d1134 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc12315a00495fc230e70c84d6bdfbd734486f6ac15485b134fde9dc88c8cb19 -size 849894 +oid sha256:197ad9cd55c1451b3685b197ae36d16705d6fefd1ffdad7ffdfdcfc7987c8fd6 +size 837166 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 7d34f2a1c8..9404b66d91 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e22fcc187df972c9ac51b79f741a5c4684d31bffc67d6a9bfa3331114c0f620 -size 793766 +oid sha256:601e82ce434783679721bc49c92be5d71dc840e668f61e988e2660fc2d5373b3 +size 780050 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 1a6c32eb00..2087a2d096 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70d0d562310d3ea0aac98e3e7a327160d9a4b1346015f6ed89aa941ff319a5a9 -size 757238 +oid sha256:cffb0880e93d6a2c7a5111a8c33201f8355d61782a8ac5ff76f3fec8693dcd34 +size 750430 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index e560ecab4c..80f35fc628 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48b2f63aced13c276f90f9457aedf86aa3cca000d284afe642ea74e52d7f699f -size 705994 +oid sha256:bf32ae809874b3fc363e1fa5823566aadbe82242ee9d6320ac30cf2f3dac22be +size 697754 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 838f9e414e..8e09a7ca15 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:76e9cdbaf34b1817563bb9c39e0a779536ed2a25df07aa8bbeaa61e68cf11ab2 -size 705852 +oid sha256:b41e68d4b950bac5258d5b80dd5cdd0547db1c60bafabef047bc2b20675e4b03 +size 692186 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7357765894..3b5126d5c0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ed56e708b5b437f272b9c945c8f26533a858632d54244ff7981462656882d67 -size 607817 +oid sha256:9905728d7bac14451f28db90ac3c8b176c6a8119304df76ee77c1cb667f31037 +size 598739 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 600f4819a3..6030f4f8a5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70c4c0e2eea823dc591fa5748820b4fdf615d35986bb035a0664cbf3cd58cd41 -size 782960 +oid sha256:3a1e774cc1585fcbdad394c5700bef770920bc142b9cf5d776b87b7710f1ca43 +size 770084 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp index c64fd0a6bd..f915484a60 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53158af8de90a7e9a148c7eba79dc1826c81e3fcffd1d74054371440f3ec9208 -size 689120 +oid sha256:f25746d48e4a2bba5b4041fcc806d0dc4388ee8277787b5c75f1010c384ebab6 +size 671014 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 57671c50be..e29493a3e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:213eabe11ba6876de51237e3880731b370f28f6c78f79db4b450e12fb23eee1a -size 709650 +oid sha256:db1985199cf1df561e4e50c9144e034112fcf19410ba58a720bfe98147618e99 +size 696528 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 9e1b605fcc..f10558c038 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d5b730d5810a66b47aaf423b86d65e9f53e3693a909661e449003117b515888 -size 610383 +oid sha256:e296bec0151f81c732251ca7513bc34658e21870f6ddcb1cd36034ef69131fb8 +size 603229 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 92755dd1ed..8f909e76d3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b0b700849a6436417b9b61ae662154936298b37de123d05fb89e4de0c84e873f -size 665446 +oid sha256:bdc6206fa81e934d929c7da93723b3fa784274901bc42950ff4c117ea535e523 +size 656418 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3c747778a2..352fcc1225 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:340f462e94ed72a5db68bca7aae0de28e8455a5dee5cac375876a7c4a281b226 -size 569433 +oid sha256:0f4b5858d7009f4ef3f3608740d1d5e7b5206f237ca598abe2a39b44a7c2c339 +size 561935 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 0074117b28..02fe433a17 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:641e2e17cf92577b995d8f6aa44ee96e7ed8dd3a4096a70c1278a59c4c5d50de -size 889928 +oid sha256:19bb754c08d5ec520db1a6cdd1be7f899054348596c548c4874881dd01713e49 +size 877200 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..27ea23c10c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ccc9ef9d6d4fcbe433bc915efb1635e04412b51a8fe6001e1997fc89d6495d0 +size 828076 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 0fbdee42cc..6c32038509 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6034144a58edb73cb0ebb0b8af0e1e9cb9f3013f16740bee8201c65bb8bcaa3c -size 803192 +oid sha256:d413394fb830f9299cce22bb3b5f8bb1aeff7d687757268781d03167bbdc478d +size 795102 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2554c34934 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67294f0864ee60e6534fa3ae3c7da552b98ef82f73683a79622adb7e722d89e0 +size 744004 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 9b1f5d48e4..abf144e153 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb9331dddd1a140ac1e513df0a025a2e41fafb62f15f18e50bd6c8c79634045a -size 610315 +oid sha256:a90e58537ba6e8e7dc84cc55eb4d991eb5fa3e8486f842508fcc41bcd5983ca1 +size 626054 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2202575d7c..7a84c8ac24 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4b7a41a8c4fbd3119098e9e36a6ef3e89396aa021092051505cb1fbe454ea60b -size 507843 +oid sha256:f723f3e0cea38e57635ca2bdcfb5c447c3f80ecf8d11d5d8f4283a55a98d5430 +size 525751 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..920c1aae68 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12b8303348f9ebd8632ee57e71f9d3af45f9aa1c510b178b88b2f6e1312defb0 +size 695762 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ea0b39a2c1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:a110668a36a5c195c6853626624e60e08556ecac94e6f8886534aae10f9d9446 +size 591363 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..af84c10218 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b8fb91306dad7d32c8fbf085ce83d41d2af2a8537e8a8a962e827ad2236a606 +size 721464 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..632e7cc5ad --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbe12771124df8092a8963c372079b8307596b871758d41a35ffdf4343a0065a +size 636850 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 603b3cecb2..56217fdb3a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7dac587fd00757f73c8664c322080f12726a41aacad442653e5e68112836a89d -size 582145 +oid sha256:4b6945dd34ca99580f8e62013b1878d48c1eae9c147f134494e398bdeb916774 +size 592997 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index dfb96e38a4..24ad933e2d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d034aec454bd63081de8b8517f8b566404302c57f043f46874dfbf3bda92c1f1 -size 483321 +oid sha256:04f10548065380667da8c4e3308e3f88f1082d980e66a5382b5025680f461295 +size 493337 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 1097f6f042..66d4717904 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e9794760c8a6921a10f990e2582ea5c959960d3769cd7bc875faaec0c0f362c -size 636826 +oid sha256:9be06455d1b4259ceb17f813424a58fd1369a7e0d59242364fde27e2848ea80d +size 623778 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 9320346e0b..6534f591c1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e183dfcc9629c6d8c76781982cfa37e86f7cdab6f11793026dc8f59998760e31 -size 567135 +oid sha256:41dca6dfc7bd560bb424a30302e04ad19e92ac1cd9d47291dec5b38a26d159c8 +size 560893 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 775c6858b7..ca6c4c3113 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ca3197ca8ca2fd3cbcbf7e93765d2a14b2273d5bdf5b03ea948719ef48a04958 -size 486041 +oid sha256:557411e4ebdc4c0811999db58b3f83614a31b14b6bb20d608d75a3a69742263b +size 479801 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index bc57aa306b..4cebac3897 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41b23aba4a683f84bca743f15edfb14246704385d1bf0bfbf1ebbcb29361877e -size 455527 +oid sha256:bc3ac86d2be810c76dcf7ada6d8e3ff7a1b504a15f50e1f89b8257bcf52629cb +size 449311 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0bf8dd6db6..199d126b27 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:268c4696907abe399bae64a082daaf5c2ff82f11848d4adaca4f581047e87b9a -size 475305 +oid sha256:e672dd2cb806ee6c4f5073ceec8eb781dcf0ea468d52c95fe9864aa351cc10a5 +size 466721 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c1e433f376..75c84129a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:298f31a836f5962fcf594ad136ae71663f31d9b9b2f9fbe47a4f290d21f2fd66 -size 451131 +oid sha256:52083079a0964399c9a8043fd0b30fdc48bbff350f3b2411c22a286aedea7cde +size 444125 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 68183f8753..515e189466 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d83c9ba3053d39b8f9085e2d841e52ed7132a668fe9abf6517ff8446c82f73f -size 634160 +oid sha256:d6cbdac39bf30fe5f0a945f40b305d06b8d73679b655adb54617da798beebb9e +size 626144 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 295ae4ae69..ebd6dbb56a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5d1d2aac92ac6bd39e656e7f4d8f5951d1fcf4454ca26dae282ed5237ac1a67f -size 569549 +oid sha256:ce958ccbdd3c2752835b8bb47e027a85102eff1133ade025224504e195fd0663 +size 563359 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 3d1bf2785c..40d44cf7ba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce9bb5d6bd4f237af49d10262dd0c8cbbc0e4fa8346b6372734e52534f9aacc4 -size 455903 +oid sha256:3c391b9d2402c1c1965aa146f91cbeb3d06bdeea6eee7d7d1a5d05d7e119311b +size 456791 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ec0be29be1..0d32c9622e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0fe86f5e67a5f4b5126e78260eb0cd36c2562d5830bb948bf548968c4b83660d -size 397139 +oid sha256:78bdb522fb7c88f3fd4392792f1309bbfd9078a48a1faa61781718d34f6eaed1 +size 390923 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0ba289ed68..d22ca9a5a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:a5256296b50066bbb4370480005c39472c0be07ed9af1dbab349199ba5f48d0d -size 439621 +oid sha256:7ab4197c9347c20196fe4735152c7a6c6aab389d135f978041a9e7c9f1d5392f +size 439721 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ba895da752..d635a15faf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8352838266dbb462c5a8143672b4c769fdc4c97b848ca8ec7555b86c05a681d6 -size 380857 +oid sha256:378abe93bd82adb2eb8376599e76e0a57c3caa442913552b94bbce1d72adff8d +size 374641 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0d497d1905..57a6f833d1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:c3f217702d26c68da7e6d76ec0386ae19a84195e35d339c21376c6ac16df0f04 -size 508177 +oid sha256:fa09566514f04dd9a5cbf2a5abf2b711833d11b9fa422f9c88f7567dd8aca1dd +size 501937 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 868908a19a..253293db57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef041dc47da5380b44fc9a8850f48dd3a66d93ab91d85aea253f45c5881cd358 -size 472137 +oid sha256:d3c8df73aa212bdc53d3b95973e91afed5dec0e37f0f08f70b6c35dc69b29c69 +size 465921 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 261a010747..87b8e86b9e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:e79f1ec1c1d2cde50ced951c90d5ef309c46d299b2de5cc3461c2e30d5ff8a04 -size 497441 +oid sha256:f020d3ba8907b5f027d211ae2543a104d81c4ec67244ba302a387e5b084a9df1 +size 488857 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3d7985b2b1..2231a5f344 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5804ca9380486bcefbe4277a4f4bc5493f4ab3a8a557f67e6d5bad4727fe2252 -size 472477 +oid sha256:c2b6930b2a887e71f7b0b207b016ccabb03bbf1a792c7f99840968ad506aabcc +size 465471 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6216e7772e..032503c396 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:6ed49ebc12efe865e0ed9d3017188b12e02d157294b76774eba716257b0a6299 -size 677214 +oid sha256:9d647ade4dad7b4c75e384d5765881beb84c94dc58741ff5d4f3b09aec561c70 +size 665744 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 242abd6043..be9b40e8dd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02aeef79f476cf006c2bcee58e95f7856d7828fde9fa8f982e99ec562549f105 -size 608951 +oid sha256:f175d28fdddbbcf4873d80ae6692217ade21240c8ba5a0618daced4a1c9fedad +size 603501 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 32f4c3af64..95a36270a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:6cc6051ba0a201df41eec73be31c864c9d1f8b3d2c85bf71b215075fce6e3323 -size 484255 +oid sha256:1e76b4e496b2e490713284e272982d6517b3d5927b88631220968657b6bede2a +size 476461 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f6374098d7..260e219328 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e250095162b8b82be5f61a4c97ed4f41c9c6eb2b37437351c46af5c650297898 -size 414415 +oid sha256:7da54e646bc62d8c612cc48c4ae10b4578bb78282201f8b4b762994efbf136c6 +size 408989 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 862753af07..46fcd0a799 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:c06e52a73680dde5c8c6397591d2de9d8b3474d501386451df43d2b5611d37a0 -size 464027 +oid sha256:1414637b579cb3a8060a5d7d7cf648b9d77341d459192a7fea983752ee6241d1 +size 457811 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a1067136b9..245eb6762f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9bd286b9ecbf7b7a775f8666afacd4d907996a9930d31a6f5483a67da1622c21 -size 397345 +oid sha256:f5dedc1acdda5978be03354a719067a4a4f6b02fec5e6875dcf75c18cda73f2e +size 391917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 9fabaa5745..863a11b1af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3aa6737def24d101c9bf3db4622342c20586b033cd23f94d1b0c36038499e227 -size 667562 +oid 
sha256:306763da17c7bc674cabb394f928fd5458646f0474e71bccbafe2f9ec1411055 +size 659766 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 6c3557b463..930902c87b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa972e367859aecacb3701a31431353a63d2f979d3d2611441c52427054a6ae4 -size 575817 +oid sha256:444282756edbe657105f3ab555bc2241fe454b242b62e6d40f82eebc7f1755a8 +size 569601 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d296f546fd..17d5195469 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09b2987882a2fb8dd2cb033af1b20c9134c4b615c3a46c7654a327e24a6e3317 -size 598917 +oid sha256:14de8a27e9cde7f811128fd98ba5aed0043687f6298e9a6517cb7470b5a95153 +size 592675 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 175871794a..e9a02784a5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ad2cba12b625206451282000fae7654e8b82aee0b02e0a8e65dcaa9221dac7b -size 573927 +oid sha256:f21d05d61c65651f4a96e758f76f8a855eaa038dc0681de45f20fb3725be9f68 +size 568501 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2560c774a7..0ba3ecc16e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:90df126b12b1cfd15b2865d5aad35be9f3a9ad6438c8855fb73197ce19fb25e7 -size 585023 +oid sha256:e2e5bd3410395b5b9863965f945e03992ec19e2d2455ebf6bb177555ebae7917 +size 577229 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3531cf03cf..3ea88ce01c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0bed40be9bea093bebde08f73127500eee644cbe2cdd01626c4ab89bc048d9bd -size 565583 +oid sha256:88a86061813fbac7d177f3c96384aeb27aa814a6ab3b2f3e0a736507914ff379 +size 557789 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ce27c24c3a..c5feae4824 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:304ca95564a2c31574fc1f7465a8d3a56059b92a494e6727d7791b7d9dcdaefa -size 669928 +oid sha256:5a766ee559a923e1d8f34238230ae7dc59120a6e8baa9e3e5bc6ee08fafc1993 +size 660554 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 1ff95a4208..7b71337c4f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0c75028687c23b121e00c35eed5b24259befd6524b8dbc75b4f577af75800b0e -size 610669 +oid sha256:ca110e7e2234836fa533153cd12749fe76624d92ae12c866abcad3ac0af89f4c +size 602875 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index fe7c284092..eed53cf49a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8252d588aff54843371d5cdee86adeae0d8da62f60550d6309340dc56116e93 -size 564043 +oid sha256:f5c45ff510b077005bfafb091c26adcdc236d56deaab6d21b6f73a43e226632a +size 564931 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9cd3310439..49777a6efe 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b8f8d2e58c91f8df6c22d442e3dcbc8a220f08996560d2043f117f9dc9879f9 -size 506067 +oid sha256:463320b74b0f7a0c048f09b8db45a69e3780f1bdd43fced1512268863a863fb7 +size 498273 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e96ee36efb..2369ac59df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eefee97fe2192d6c814e9f6ebab13abb9032bb9372d29ad8dbc48fc713ddb9ee -size 545393 +oid sha256:7b6b5f6ef4656fb5bd0f89540ae4ecba8e02472d5b91eeea0c747e8b3a42cb2b +size 547069 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1c9727d0c9..275249af89 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ede1df536d680b33e9bea49df9df02987580c184f2c07dc78632ed17228022f3 -size 488997 +oid sha256:b17c417cfb27c1057a7c03b3d3e9a980ac981919536c146ec3855e67b98a7799 +size 480413 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index dcce4da284..1d8038d7b4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:664d478a1a5f6fb27118112eda0662ec1f69abdae281f5f9dbf80da2335b1964 -size 621842 +oid sha256:e3cccb5c57f3be100f38e446f98e5794646d0d5eb153a861c984cf90c523161c +size 616389 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index eb02e03434..d75816a9dd 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e0806b817187029541999b65641bb52398cf50b4c4c1fa0e80f346f4a74698c -size 596063 +oid sha256:a94e7318b36a8647493a713a369c2479029ee7c3e0e5dcec587fa2c0b4513a86 +size 591425 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 07730bbaab..1252a3f398 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e51a6439b1898cb0a260ec4d535804779e57af4faf15aac63071dfac6b95ccd3 -size 608737 +oid sha256:bcfe48b9ab96d7673ff5d8ac96d324bed0ce4d6078806fa566e36c55bf5360c8 +size 600943 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 
3354c6c0d2..327f54b54f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:abf8aaaacd511af38dac137b1004946788e12a83ce1c2072480cfb124463592d -size 587719 +oid sha256:e360a05125ff7235ccb07abd097e44aea188ee18551f78836f5295a0df8d73e6 +size 580713 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 520d55c805..c18a9a1ec2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac5d9667703f0bfc6ac965e6a96f363764529ec8a62166ca9af7a7a27a4ef1e2 -size 707948 +oid sha256:4a051adcdfbd3f00b6b6c40d7016ed980ea8842af57dff3d1a486e6042c5d671 +size 700154 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index def31befce..529e0e736f 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fdd7543a42485cc53a1f337abb1042c33f8aa84f09db52c796549827a02ec938 -size 616993 +oid sha256:cb9a5be071023429165a16d252b0fca4f6dc38c17bf3739ef1d3bdef20da31f8 +size 611567 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index b22e1e656a..0f8ae34cd1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a8be870b93e47221492ac9f5170630371ecd1d8655f74b26e5781b7f228aa79 -size 591605 +oid sha256:e58d62bc10fd815a338bef726a9caaad246a342124a48350a5b626da540f4853 +size 584599 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8d0fd13bc6..713c128d36 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b98415dddfc581f0d35166b13ef7ad7e5e26e2577ac207cd432baecf03429db -size 518607 +oid sha256:6d67dd8f07a922ce016a7c237c5fafa800458e3edc7a632afcf2ec6f312d3037 +size 512391 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a9c6e85d4a..85ddb3948d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7149be6d6e6d2dd7bc07ad0b840aafe2c3d97d55bfc6df950678680b695c605 -size 569797 +oid sha256:a5dd22fdc77a34f8159a69352a08b00702c68a218dd8c41b2f6e423613d95c8f +size 564371 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 635c21ef92..792b00761a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c1a5c6cf6472ab810e7a4fd3f3edd62a46ac46c9ff4b8d86cfa268ea79d4699b -size 499957 +oid sha256:897bc8d9f634368bd7acb56daa71de3edd6d2b30ce4927b296620ed7c629495b +size 494531 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 31147f18df..984cf97a48 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8425b56058f9820e6d6b0a55cc52149ed7420716b7241e4bfd367f270d75d11f -size 564427 +oid sha256:7c9f3832c05c7136877180bbaf1367db9fba7394d953846bd7c60f86d7fc6ab8 +size 555127 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 21fbb26b0b..f906c0e870 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7797645c990b3ac64a2cb41f2e3e82d9ed6359aeb041c3eec4f808a25779caa9 -size 505687 +oid sha256:35d27b9546c749f4afcd84b13f3c8acd7b829ad6f76eaff55ea3aa688793a841 +size 501173 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2eadeb06d2..8e92a98377 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e8b847197753895b5129b668a234c41ce36d488d0894768eb4dcec1ec7c2d3bf -size 459425 +oid sha256:ffad820c1e395b476e9f3481440a5d4effb2bbf9bb20f7d0843f4cf4f85a4838 +size 453159 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e0f9bd86d8..cf8c160d9d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f0c4f2d9e8c72df82bf0fafca295ab3e9272043b2e02550e316cc5c175540d32 -size 445263 +oid sha256:957a247e7243797c6a32e6c869207c258a039c47ee870eb2bea1eea8309168d0 +size 439837 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 82bf86e959..709fbfcf7e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0adb4945626d0ef8a7f710b06b54778473bdfef4ffdd5d905e0ff93f16820fcc -size 454189 +oid sha256:2474c89bcf2032bdcd16e48e5c6996416bf33168ed4ce40c817c6f8270f0118a +size 447973 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b3b5f45bc3..763cf5a675 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd0409aed7fd8e41ac9fb6219f6b4dd774778594e896053c8d339f3cf201ebb4 -size 439289 +oid sha256:777701ef6843f25bc047d4a9d66b15559bb2bae156dbfc49b27726d6b5690ce7 +size 433073 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 963b913f0f..011bc4cf1a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2172c538da679dde5c7a9ca49547bb5fe4762cd2f100f22014009f443f190dd6 -size 563635 +oid sha256:fd38bcf0adc74360525832486205b40f5566ca85fcea0d0ea2a5573ee63c3126 +size 557493 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 1cd8dffc88..bef2c4a378 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:ac6f0df9af870e41cc46d9ab1ca6f5d06469da5e9a12bc80810ce1e14a92fc0d -size 508103 +oid sha256:20bd0cbd5c547e8dc56eefe06e0636837a6300e77b75a03a7b0fb9d8840cbaa2 +size 502849 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 2dd0bf34c6..d8fc465bda 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5962dc23e996c56a794c53f27ff734015933bd584d7f669f9c7cbcd33950b7a2 -size 444259 +oid sha256:2925b33db122ce8d9d75662a2dc230712e8840b06df9f6c28641bc78fdf408c3 +size 445937 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c99ed088b8..0872d7d3b7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:425b70c17a2337ee6359621e818e0fb9a49554d73c408b16d3f44d7eff2852d8 -size 387073 +oid sha256:9881055c1ba661f6e36ba882242bf6842c41e1888f6eed2eea6e5244ec6bdcc1 +size 380857 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 37e400d6f0..8af3910e21 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72d90ec6bb9c64793359b7462e9f46c7349b5a341102e3503ad6ea6b17c6bc48 -size 427779 +oid sha256:5842f13bb4c03bf12ca91dce31ea314ac162ed5d326839576c57f227f25259e0 +size 428667 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index abe41cbbaf..c81fabfcc0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c0f9f4bf1ad3609d9316b91b087acfdcd5b6db23a9a7531495fe4881e3b8f615 -size 369805 +oid sha256:05457d7f3ba3d47b96d52eab2146e60edcf8e31b335de74201c6e54adbcfe9a5 +size 364379 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 53f2b2006a..92f9a75b4a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8802c9eaa69cae82b0ee62ab242f88e8f532af937ab09445f6ffe0ad61af314b -size 476775 +oid sha256:ab6f4aa5171b09dc59496b7f5bac3262b29c0ef73ee99925e29bb69234b6ede1 +size 470559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9c9b90394c..3b8576d7e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9fbdbbf75f4d6291f75f1f2ec2e0e7846ebbd34b7dabc689f2d7deefc61128d6 -size 461873 +oid sha256:5df4c3f362d142f31f46a40a9fae29664cdce8333991a1d993ee27dc031fa241 +size 455657 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 934f2f548e..17bc4e3562 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41a0f2b399b347bb778362c8729d153187039d5e2816e1dab466908be741abcd -size 476325 +oid sha256:ef5d340ebf8ea533e2c797d7e1fc58de90653a7cb87ae73092f752b6656ae17a +size 470109 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6103cc9caa..0eedf809ec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26c2f0ccfe9b306296da4b7fcbffb99ad285e940ae5a7c17d612df1bbee2185c -size 462213 +oid sha256:362a2c286af6e4fb30d3e99ac0bbeee16e301182ffa97a69f38710277d0b86ee +size 455207 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 98cc12fb0b..9053f98c20 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a3e7e8c5fb5f5dca5cfb672d8b90a52da54c4b8917781fb0bc517dcfd9c2f2c -size 604813 +oid sha256:8f850d4ec07e90fbe9a1696316aa3fb9f252270245fce050fc60e6a56888391f +size 596303 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 3e72820d6f..013a35a9cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00092e4c90ec8dbccaf62726ba6a2a85f122a24f46d797700b62910a13ff52d0 -size 546075 +oid sha256:5864807b33b81cccc65b39adfdc732ef068d1dae3edcc2d8b630df5340257666 +size 540771 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 100e636643..cd958a4b8f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c1a09cfe5c830f03b0ed5c2fbc3f7b24fd466db4a89cf1e3570b071b984f4a08 -size 472611 +oid sha256:bb56cf1de9d77fb587c84532c1add5e54a6d48bcecfc5bdaea752b0e77e97698 +size 465605 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 00eee6231c..d69d410279 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bbdc34fe828b861e936b90c2b10c44b70e6cbafe15f5b4aa11a7f20756dae231 -size 403561 +oid sha256:9d57b0b56ac59fd49edc5278aa5f9d775f71f06f55bfcce406547cdde9366607 +size 398133 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index b98b511b72..36e88b0996 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b170bfd6579ebe430fed71f5a41785d869dd9163e4e9cc9dc5d2a97ee7584372 -size 452185 +oid sha256:8790d6fbb9729c9efbc963a1cbf423fe6cdf98d1c931612b903bd2de22d6219c +size 445969 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 179c2c4eeb..b7f898ffa5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d2519f6a470fa4cdeb6787309ae699d42327bcce7d23f53aaf5e7b0c9d96207 -size 386291 +oid sha256:18a6aac567231478d83652776efbcf6def7cc5256936635fc07820412ad17acf +size 380865 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index de19be3c74..0b2b87db36 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:360daec624d9ad02e8b0b0091c75c9c3c0f5c3d9c29def9c95a48c85a3f7456f -size 726842 +oid sha256:786baf07b09c12ee76c31aad1cbb0d033904124262a05216e3ae8adcbe82a829 +size 712880 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index e5c834a943..54db74a051 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be0b212cb04d790b455157e0f169392bb71990ac930edcc571ddbb7e5c7bdd68 -size 640550 +oid sha256:ee7efe7d12e225b97ecf288bf2535702638dafe1e1dc80bd2f90def95fe12239 +size 632706 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 29b3d9e89d..89c649b1fb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4c43d61d40e2b3db89e889744861874b756e874036172e9baccb4d634a53d457 -size 754640 +oid sha256:f1b51a193bed94bb139cb224488f23c6d31c3c18141e79d4ed05e483a53e5c14 +size 739494 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 51061f42c1..9faf9848f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d7456979529115dfa2242498a3e595da06212074cefab95b4c03385fb295a19 -size 666918 +oid sha256:65e4e416f62ebfd147e22bf2e6f3a72ca3139ca217730abcfe22d25b182a7369 +size 658876 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index a9db3e333d..7bf55ecce4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:93186567a9ab7c15d1b821eab644217ddf21dc65a9dd49edc20481d9439fa51b -size 731180 +oid sha256:4f2389bf6b6872512631ec9ebcc9a7377781f2e41f85b65efc884434fcdb84fe +size 718552 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index aec08f9167..b627d7d808 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fbaf88839d17538142e8da5724f994bcdda0fe3ad2307d5d3bc7af6f26c48ea5 -size 646220 +oid sha256:8c2dc4a8cd40a30d16bba216986086cde34dd64e94d2a780bab18029990ae290 +size 638870 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 
6726df5005..f1efb38943 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:71cacdad9dd6a2b2a5fb5a125f70c0242a9fb60deff6b5c9e1f52d5ebd9a0f66 -size 761248 +oid sha256:2234221a86898264bd3ed5e463d7a1e551842ecdc0bca8cf5ad7c19de645ea2d +size 746596 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 5aef85374c..0105ad944b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a8712d823a530489db9eab2862e81fba4dfd4c32255a823120d976e2d64b4aff -size 673428 +oid sha256:ea25c4e05f70469b82aea9d9f0d923d2142e1d0f321d3b8fcbacfe1a131c6e2f +size 665238 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index f6fb595347..073691f6c5 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d03ffb488547a5e86345a8836a2da74368a23d3acb9eca07c8d0ad20f03a8749 -size 795644 +oid sha256:484ce3325d9b7d0214104fe687cbddf95992d6dab0bc98c2051a3bd28c926ae1 +size 782818 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index d328384c0c..572f2dc510 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2cf59813dc28b8fd422c6a68d97d0975694c257750777bb7b443e88a6b23c4bc -size 710092 +oid sha256:7d0de56ab977f6766461049a24e1e47dc805ea4b15f6a199a64226a007022c33 +size 702742 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 858dc8af12..f544f161df 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d6d5b2d5498534c33f0314e4ec56a5bc28529b63d825e88f107520c437193a05 -size 824380 +oid sha256:eb37ee25d5a872262c6faf95caf9b715054dc321f0558c437dccba0283493153 +size 809482 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 522cfa9134..abf5e86858 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9773d8c0e888964f4dcaea91b8f796b27636dc8723a7b3bfe68c1d10ac706a54 -size 740358 +oid sha256:df2f2401251e4f21fbb5fa786a3a18949cb72ceba0e8727a198b95a9ebec2529 +size 732464 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index e083ef7d38..7a31969f7b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c6c5cc8d7da1ee0fe4da47eb096c901707377bffb27d09310108d4a1733a262 -size 826990 +oid sha256:dedf074597552a90f8ae4452726392125040e46ca2e0ef5ce10620a5b646bc98 +size 812832 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index a97647936f..c8e695a36f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dae7590429bdecf18ca0597f7b4ef7812473ec6024bbf140d06837cd3b82e56a -size 733988 +oid sha256:c3945e7c3478116c494c1c6fc6b37f213e963fa64b477998e7249adb58a25fb5 +size 721804 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index ecdc11cc51..c328369543 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:803a06e1a7ee0965bfad1d90dbc205ed02ef5c68ad72ed94414f18f10a62ec94 -size 850348 +oid sha256:e45e14ffe461bfe4f60b9eb3cd33db6c06a4fe4a1c8d4a9e6278b2d53fe73fc4 +size 835844 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index ac33fd7376..78a656f843 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:913dcdfd0144e4c6d3072da42b2afc867c5bbf874079d81f98cafb3258c1550a -size 760406 +oid sha256:f0f659819cb1123418796484a15a18ec737b7e4b1075af711cb2c87db215657e +size 750194 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 38ae7346d4..3a0c73802f 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d9ece58964d263f72f79997388e26fcdd282ed38d44cbb97885353da552200b6 -size 830146 +oid sha256:beb94ae3ac05dca67195608c30fe51a63b76570a19bff890bc086ecd2963153a +size 817614 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 06f10b9d16..4ad67663cf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b6233a597d5a1681b4023e8f4ae54d4ba841a51f405c764911d169a4a78955ba -size 740696 +oid sha256:9477030ebd92f9bbc9f5518805d0410f5b0e6b206c0070d86a678cf5d8de8ef8 +size 728758 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index d8ceb881df..2df1638c48 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee74d4188a9e4f3a6aeddf198eec8a2dcb443f6af2e700e677345b5f2310231b -size 855822 +oid sha256:2672babf896756ff5f1a795b39512abb787f5a6e5b1858dbd2b1e3d24582d4df +size 841960 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 3e81fb2dfe..42a3902ea1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ff9846400bab46473f9e2da8910882d9ee4c909c0dde693823494c9fb722795 -size 765634 +oid sha256:799e7eb63b06b0fb4ca95a94ec9bd23be2d493ebfd89b09c3b6344349088a03c +size 754336 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6e6f4e0b08..51e99ba054 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9e7347c083d25a6ed93c62d30ce8b76a69bd9497de0baf36c5594f04dec5af5f -size 898802 +oid sha256:c1b7efb1e95242503c97dd5fa54a0a5432b2e0fa129e511d744d21ca1d684bfa +size 884248 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 0fae9a9876..68a5a21752 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a3c2601faccdace3b0f7dd574abce989f318dc1fa0b87a9be7e2eb4971eb3974 -size 809056 +oid sha256:0e3a3a303eecafeffecb2c3a6a7c81816a566d22517f9f83449091dee4e1b24b +size 795342 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 79153cbae8..9e045e2410 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:900f36736b38e98eaf4d4d1f35afb8867c860420736b28ad36f67972caba1d54 -size 922556 +oid sha256:fcee3a69c7d144df11c06ce4aa859c7aa6ce9cf43b70cf563bd05bbd89d6b77a +size 906472 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 087b77eb50..380dfd8c75 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:939b02e9fccd184d106a8c49e7edd3a034f71d3e3b76a39c1baa973fa5252b85 -size 833748 +oid sha256:0015af1af7c49f169b0ed643ba91b9e1cc879645e367709e2610ebc292e18691 +size 822796 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 0021a237e8..eb64dfed8b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d073df4fe7974dd7003c9475e443f8089a800cd06ce014933b017f1d4423ef7c -size 689644 +oid sha256:c40688a70be22378ec90adc778ae3e763c882889325366fb51b99d20d50ebd71 +size 675978 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index e5d96d3923..00577e670e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a6ca44294488a6e5e5bc29931b55e167a25e1da30ae09fc732f3f54e6d96f2a1 -size 587417 +oid sha256:8e8e79546fb6b48bc27cf33397fcc642631d450b090e6256501273fd346ee048 +size 580461 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index d9bda3d575..b656494224 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0177d50cb8215621326bc15c51e60efaf5ad0d068cd2377890fc9a851b65d775 -size 710338 +oid sha256:ff7eb31695b55206812b8c636c8c132549a29a525a09e134730fac3b3eaf9e3e +size 696574 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 0762d9c0d4..91c70a2920 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:168eb7d611e0be4a84f3bd5c5653d9b9e1c9560f6e9dc5fdc95357be9606a0ae -size 607765 +oid sha256:37d15ebf8aec8795ddb399e88f6fa96e5dd2ae78ab878f7fa59cc438a2ade3a2 +size 601401 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index b57c76b90c..fd2dd639dd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5104fdaf6b866ec7ae3ed50def539236321afdb99a497c07bcd646044e8931f9 -size 684758 +oid sha256:0e6b2e86c0f6ce42fb3b6015b19dd25b9e1a12be7a65867e20b60d7dee717995 +size 675878 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index ed3aafb2f8..4cf2fc0e1d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bdd90d8951550d6da9da5e22782cb1b763ef32a88696e33ff701b08cf4730816 -size 603991 +oid sha256:91f89e09e0bf9b2e11e2c9daa2a745cc66a575f15187393806cce45fb83754a8 +size 595555 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index f5d1d8c91e..b1a5a5c971 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ 
-1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:523c1fc1b3b72133843159f891b6c2d27c6211c785893f1a9f14a4b77e62b3f4 -size 704712 +oid sha256:e8971c4f9b1bcab10b877da8c6586b45ec2b476c506562d8e811b89b82b63693 +size 695586 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 72a7ec771f..313b2a282c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e18611213276154a7d4847c6bb464008f5af805339833de25205e9bdba37208c -size 623008 +oid sha256:6668ba699d1e81cfb6be886a18435baa271596f201f23d9d5c963e3d8dff3614 +size 614325 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index e817f8e6a6..7044199347 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:dcdc7d20fce6d3f503e00f6686e6d8a63a287beb3226faf7d74984bd1ace8f41 -size 755190 +oid sha256:ffe39802c51fe0895f8c1da1d3064fa56e4f12e5e38104391223b0fc6284e1f7 +size 745768 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index d994ff5118..6e59bbe595 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:16163094d63aea68b28c00403c66dcc97240c7d091398c7dbcaccfaf996551c5 -size 659032 +oid sha256:170b12eae92b5a27cdea63131256d2f2e7ed8f5f98daf44e9e0c227d03165c82 +size 651138 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index e627a46a7c..8e6ba18821 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:50926d15cb934d5ff342e67001a589f923f70a1d369766b3c77c02d8e2ebace5 -size 776774 +oid sha256:8f4819c10ab717f6383d3e5dc4393387ed6ff8c3432751f8d5e68e44e2fd27f6 +size 767844 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 9c0dba92bb..08bb1f5e3a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7527bab59b80d65b653148d668d0681232ace79355bddf36b68bf803fa4ae3c9 -size 678494 +oid sha256:d202e74123d5063e6b82fdb026acc330411551201129c63c3196f2483fc71157 +size 671290 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index d1a49c98f5..77190f2741 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2be68b9c9703e47effe30e5523a1566c40f2124eb94ef051ab82a2a9a06a08d -size 
797292 +oid sha256:49e1b866db38f1b497be24e208ccc09f4ffcb65d999230dd507eb4aa123f0b79 +size 785944 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 7d2ed41dc1..73b43016d8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f81a1d3477b2eb390fc849850875e102e0801d573a967c3d35f1aee67af1551 -size 696692 +oid sha256:3c4d308a5d290f0751cd9dba7500d93a89cd4788904a6254464cf589704a1631 +size 689144 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 548c35f14a..cfabe4810d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec09ad6de2c60936bf8a19b2a25611be0a8afeec3b52dd152edef09a8c70a94f -size 816358 +oid sha256:795fd7d6340cf4243babb892f1985e9730031c28c69c62b0cae312cdb1bd0cb3 +size 805900 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index c045f1269a..a262124f64 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4492d231eb0096f0a7014d0a172c7933c96b869a2b9e5ced554c4c8decc3971c -size 719804 +oid sha256:336f93f01ab2b9ebff55da6cb787c581c24a7074e7c914a4d84ffc63b0dac518 +size 712060 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 4bbc604a9c..0a6c42087e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4515165a2f0801517651841117f653d5444f43d8d155571d1b4a86f60ff4f30c -size 796550 +oid sha256:632130e5888575d22e4bfb9116dd90c85c081c319377b86a749101c15b6753a7 +size 784956 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 4df5ad9435..4809f1bcba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b0c9baed24a6ba044c6c1a40fd7d34f1d9d342e30d7c55657f448c8e32462e74 -size 714056 +oid sha256:38b229ecfd8d7a3dd0615554df70b1c0510bcc4580f108b1a0382b11e098c06d +size 703400 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index a7c22e8e48..8c6fd8aa77 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ddda3b0cf160a99320a995d2c5b6eac55e0b5b35f64c04ac9163b316fb09bbd -size 820204 +oid sha256:2b24bf7bf5f79670ea6fcd885092713ba105f52295309b4d25ca0185c83d6f85 +size 806046 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 1c3bb1b5b0..e8b09c5ce2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:850c6b1562e40b00e7f9df61b7647aab9c2cf12da2eb4956cc7bf7129ebec002 -size 733074 +oid sha256:45e5c92e110b5de76af7b2f9d003d78f52b471c15e3364321e960d521d3f543f +size 722960 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index badf3b3492..5182ac17c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:665d5e7ee4827454e077846a391669050ecb051234a67468d4c72ae1c3686450 -size 869004 +oid sha256:dad558fdaa1bf09817005872df73773e664d5d68b1951e7bc5536b5040e506eb +size 855932 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index a8417f22ac..7adab84389 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c804997faad80041aad5c2e249d4e12a50dff54d1c78394cbb770b4c4384e98f -size 766728 +oid sha256:6124f7afae9ce473e62ad5716e576bc8d255dfd9af910ed42879d812539a4783 +size 760316 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 4a368f505b..c11cc6334d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6fd71a12e3b8409b0d1c4ca5e7c40cfabface2d7b1a896d1a4775501467766d2 -size 889896 +oid sha256:291caabec01dc93fac381f6b322893111ce78815386bbce7dc78dddd6361d036 +size 878008 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 847a8f46be..3ebc3d6c63 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b4ca018b8aad4df83364a9a7fe5bc0cd6f81fbf15c15a607c091c88f893d991 -size 787916 +oid sha256:a1a17c534d057603090da149bbf726dcf60114061b329aaf7066887833e8de33 +size 780566 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 794745b467..9ed0ce3f0b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd8ac7922ddf809e9c4f70f0d3d9bcdb7f314c5aa7e2ddf6259544751f6f071b -size 681896 +oid sha256:58712633ffeb20304d2ddbbfebbb9268a662731b9e0aecff39729178d72ddeb0 +size 668280 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index a18cfcd363..70641102d2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dbd07dc627563a420412b2fcb194bc78417e6e3e6f3a4bf63298f3652c44f422 -size 596837 +oid sha256:5322923cc8a7a205d630a37d873b0b423e9c46b1dd0f44c554801a28e1ede2ee +size 589141 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index edd519bf51..4d34f19a65 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eeed5ac9dbfc70ff35895828d7c504ada08303fa1654aed50e5beb493f90e49a -size 714284 +oid sha256:0e2f747f97f484990b3bc1499d956f1ca58f8695a15c4bd6805da2f9457aee52 +size 702936 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 9b3a9ac58e..70b5803938 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a0bde02df050e36ce430a7e2582b9a2ab7161ac24876c868e66474482d825bef -size 632628 +oid sha256:acc7be46bc9e9131b93f90fecc08c990bea7980891a584cbee39995b323ff129 +size 623700 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 839fd4d072..c16aa74905 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:68a41d386c056283c0fda39d6b983252c09f3f50cc60c8181fdc40a4aa9234e6 -size 681944 +oid sha256:6a4082be800731fb3a4186a95e29b2dfc36366078dc3d9a5fb712e44d5277129 +size 673162 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index b4ff94b3a0..5951f6dace 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dbd40ec7606232a3188c62f5d00d3d06a916f2751f0859e137a5b577c00d3f5c -size 602705 +oid sha256:44bbf17137e76dca92f69eb30dc0955a8b16c7cdfe681aa2065a4d71b33761bf +size 595009 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index dd041cac9c..d155a1abac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e6fd0f6ee10f904b699f61241e32303ab3ee276a1dd8eb131fe9f4b29034e45 -size 719560 +oid sha256:c12dce0e0b323baadb5aab3d1eb170b55b407071139291285207dd42f77e7a21 +size 709298 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index e63651d7ab..04ec0859b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:20087c1ea3dc9f52828b6e6c24942600ce549925e983c4ca7305149fed011597 -size 637560 +oid sha256:afcc4769eda17c7307a652691f394dd2586571857b0b52c982366150570ff949 +size 629074 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index d0edb28431..587c98c139 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:193e5dcd501308dd27ca7adc9b2a06ab101f85f2de91543d0da8ccf3cb016c7e -size 750700 +oid sha256:e32cb1d2f8ae8f30920f10147dfa2a848ea0b14e106933603a262e58e98ec6ad +size 736492 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 63df970063..f3b87c30d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:52f90a3b176d98f3aa905a4a47a4daa5ae78ca71de831b598f29081c86773dba -size 668058 +oid sha256:791e12a5a10b719026e7cf32f12b84a4c5ea44d67d6d812ca37df28276283de9 +size 660560 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 22177d9b91..00eb36a5c4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d93777fe834ec7c65058495d9db76ac6fbfc200f67d44dd995511eda13639f2c -size 783234 +oid sha256:64ed36c39d38760754a95253c188ec014822d1212ad0d92d88cf19939ce40641 +size 771888 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 07d78291d5..98dea4b69e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:160e802aed706e5f782369b0c2c62a075175bcc7d0247c48fe9900774325b8e5 -size 704046 +oid sha256:55448ed2c7df948e56cec10e6c5ded2d57670c5c3c6ea5fe6e0b71c59b557f2e +size 695264 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6158a1af04..c4cacd8910 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa33f45a24fc65171d2f2ec7e53ac818d20471692b688ccb97f3a4234a8a6695 -size 767640 +oid sha256:cda86f8c36a38877ac70f7a1d1d93d3cb39dbaed73c71748ee529ab08537a1dd +size 756294 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 14b775b897..c5c50e1a9d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:11b02ff543fce07b88f0235f532747d9e37c7e57c0b94302e2c20d33ad9df233 -size 676858 +oid sha256:8bfde73c191fe0c7a2f319cc8eb1c29a88a7a1355645da83b78f809ae610de7d +size 668866 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index d6dfe9e088..147906ea83 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05d9b05d12fa90cb3e8ab12fee63fa3039d3e8d7a036e971863d8c02cf6891ea -size 798250 +oid sha256:dbdae9fd414e8650004ed3d2e2ce69bd24175f14bcc56ed387ae6fde12ea5ce5 +size 786558 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 179edb2cb4..f68fb02f9e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:346297f6e2a8188cabb9faa4480c24e88380839bfc79805ce83d068519a7af39 -size 712698 +oid sha256:67cfaf8ed940f5b5fd0b40901d9386868fb80aee5836d13ecb7df968398fab35 +size 702634 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 21c49b0dce..8ff3637ad1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea0af5ca2752a28ba14f56375491ce350402b60182b4cdb879b934d52d1c869f -size 766404 +oid sha256:1549f6629b26e721972056afaa8664eaef9d321b26fba155152884e898968dc3 +size 755996 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 
7b68e87e14..bab0163658 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:206f6d49cdb734df9cd3108458ff82b2fa18b3c8618485cdd17fc19a257738ad -size 683664 +oid sha256:ccd01eedc9c6ae55d2d7205851268de56833bc009bbc29a341976bdab6b83028 +size 673848 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 6b8f52ca97..ae8773d921 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:539b80cc7359cc63ca15e9956966a7173db8d662ed46c7c2fbed836b8bdef5ee -size 803478 +oid sha256:82c0b30afa733be52677407a282184e4d022f7062ea35deab2fa4aa339f909bc +size 791540 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index f12b0fc679..9ddb96edf8 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70cb1f9ebf38f873c93f15640ceea7393cb0f61425577bdcf9dd2afbdb0da79c -size 718074 +oid sha256:fd4cdaf9b78cbe401f09e979f0d21ca9d77059287d997e7e09dcfcc03ca2eb52 +size 708208 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 7629eac641..5d8fa1e52f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33cd7ce4fc0b1a2e62a5760a0dd8c7871e38109ea2f2fbe57a2e64e2184ef5a0 -size 838860 +oid sha256:dafbd0a6f004a4204643b136d0c9b1840a5466056cc7fcbb7a40294fca0a4d51 +size 826182 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index e69f93d8a5..31d06491de 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:16a7beba25443f58ca54d494ff157c788fb6a6ba5060f8cb074b948ec69d19c1 -size 750002 +oid sha256:04ac8f0bb0fca60333de4a39f931cd9e39648770533ce063921852155cfc9445 +size 741566 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 181e676964..41197eb46e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:569b11fda8e319fef4c262d93e0e547b09250647512f83f80a72752ab0b67790 -size 870458 +oid sha256:163e4512721b59a1f7e24b18f83de1a9d0453d5ba17cfb331178e3a0d804f151 +size 855410 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 591e59db8b..4fa44f962f 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:574f586158b7b792931c86f0e7b5cfb327b719faac0b9a5dee24af8ce2e6b028 -size 785942 +oid sha256:dea929560fda508ef81beeda6dc2127efa15a1bc008774f007f8883bb05bd7aa +size 777062 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 4b41edaa5e..9ef0084715 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:84438b6eef47b974a8008fca16633ad1ecd3c13fc6d05de73031452bf8f6cbac -size 765040 +oid sha256:dac64d6f42ecfc9e79af3d68d8ad558dc10cbaca0a21386cae64cc4d936d295f +size 749844 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 92a7d9ef51..b41c2d864f 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c128c7dc0794f71784bbbafba49dfed12e69ad074a8f1c3d9f8eeb0800891447 -size 673222 +oid sha256:1236db489fc7f5e4492f08b26215908c31c4f2e32b436401e927fe371e4b98c5 +size 665280 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 3e3dec2a95..a1b4a39c65 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f73d7cc3eff62dbb12cbc2b46429dcccf235e24c185cff9da579370dd9053e27 -size 794416 +oid sha256:cb9fb6cb0fd6b28666f7b1b8363705e26623a7b0cc3ac571b846037339160a02 +size 778876 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 25c5f32cc6..3e3b071fd0 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:732964e312365f85f617e22df3f1027d628003f76f43a16c7670d98422e90e07 -size 701464 +oid sha256:b7c6ddbd06408d43cd0835c3b8dc884a72c825dc3f00b6bfdd7428b48edbd081 +size 693424 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 9c9db0a7c6..6514a2914c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f69e4548178f64991366516fcf0abd915b90495bde01c01868660e9000470bf -size 768490 +oid sha256:aeff0c7b4601a5e6a8254eb1d8d65a67b663408fd3c0e8331d1c02892dc6fe36 +size 755566 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 68ce53b84f..49fdbc2bbf 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cbd709569fc11639ed864c4abfaf89c26dcc9b8df5900cd9b557b7c342da93b4 -size 683728 +oid sha256:eb73db9c64157de94a9b24c9da4e5e52e5d776ee21b48cb47e68373f5df02c46 +size 676328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index e6b0566d51..7a86b3ba83 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2568d89d6ffb6f4a87f5d39015b8d97c0e1fc70c39f462547cf82735060864d8 -size 801026 +oid sha256:92a052de5f533599379335e75dcf6bb342b9d6575a934d1b50b375e0f401b78b +size 785190 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index edaafcf085..dda6a43314 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d98f442b125edda6406925cd5b462a69538ecdca1cb0c50f509b7e10cc221d7 -size 713698 +oid sha256:ef0466bb26b5b43498b12b72e6c2a4653c104031f02825a34d2f461725e2dc3b +size 705310 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6e261f23bf..73ca2bfb3d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb0b21c7a53eb0e2510b760f3757f105ceeb6eb6fa8f8dd13aed4342a9715996 -size 871108 +oid sha256:cd9e0dc01138ca6099e3d00649fa50c112e015d48b099a4b7b709f5cd81c2ba0 +size 857246 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index d840a538e2..7b51b130ed 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b221a78d58d9cf3309c7a4df571e7059500e4320d3bfb889ea03344f8812c9ed -size 775936 +oid sha256:802827b3039da950a80ab70770dec97c392dedf3460e52ceea307894e41600e2 +size 763010 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index aab4fd0145..8b30465b8e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3e34fc245ffb057babc31eb75cf0390876749cacdea0886d94975f2bd852c321 -size 899154 +oid sha256:de13b1ca895d5123ca55e4f7270adda7b4a94cbb4792508acca2fffe2d344c31 +size 884650 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index df38acd358..b449a608f3 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:81fcfe50efd2622d2e035bdef00ed6438b6df447b84e7028f851eea9b4b8f124 -size 804178 +oid sha256:483a7e84f4d487ee4eda3a4b7b638e671bf870e78fadc6a216c4ae9b718cbe8f +size 791154 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 718ce5eb96..cb26f056a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b3d11b81a335c0ee189ceb198655ab926d404e08f1de4d879b3af3756f229efb -size 874954 +oid sha256:9de3589493a8e4550168668cd3d617632bde8992c59bde579b4374e82a15f47f +size 861140 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index bda93a86fd..d87c1bce4b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f85e48b1789cc7adfa61207e1133742bf8f349c8de68a8dc123a46bd35b6ef1 -size 799170 +oid sha256:d4cfa6e48001c3ad662cdba640bc1df2f8f5c61dae6be9ebbea22a3a4aec1b10 +size 788120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index a770bccb8c..0e7024db02 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f443d3379dc5c97f42e88a2d2f6b38ea52187104b0b25415465c3552fbf480ba -size 905516 +oid sha256:443732a2e12a2f8e666b896ad6ae22f9b1a8e624b2046ef4caf45a7731d8bc5a +size 891160 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 310eef695c..2968e13503 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6a3217371db2ee831d297a2ae3acec1ba41325cc7894e2c02302627064f07b6 -size 828152 +oid sha256:71378b23d94ff2f30fd70afcbb0e95ba85fc2f9bbbd81a25a8b68d3f011ee318 +size 816066 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 4133f96f86..6e1f1d6850 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ddb2e1cde3e2ca63857bae94c7a629841250363ac1330260f849692e167999b -size 765044 +oid sha256:518c80f8fbd7e794ea9b064eeaa628f5fba15c7bdecb6ec4ca766afb27a58551 +size 750836 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index e1aba7f82f..c4dc78a9ca 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f7cdfe16003879a7a808d3701992d8e9d1e783758d1d567fea14af4ae733056a -size 674016 +oid sha256:4717563959c924d5ab2500926987626ca70c3a7f1c14531964a7b5aed3d5f4f1 +size 666072 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 630ec28c60..4abb247fef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eefe7a4f3a6c5220dbdc9f736b444d335be45e05bc5d6c9e8ce2e5f99712fb18 -size 794272 +oid sha256:8e3109a1b7f1f9006921d4ba0a31fd5a506b13f06684c3cf5b4d507fc7d53752 +size 778980 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index e04271db3a..13531f78d7 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a5980944b6a2e56b626a036fd6d6b28d21df551b3a6c0025427bf521923382f -size 702258 +oid sha256:fcbd4c47a7feb167d40a03578d0fb8256f08efc19c6c66db97403b0c98a1221e +size 694216 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 17963e65f0..64c0acda2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7bd04f67bf53dd441b54f34e13f9abc83cd067fc2c15b051c9ae70e9511f5140 -size 769284 +oid sha256:15a1d638eab787a27b68560c505f83f0abae244dcb1b2b6d838d7d3049499320 +size 756358 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 6653a751e2..b042ee7304 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06cc475f6fe29d2b7ab41d70701559c250e665a69ac35a7c0d271b328b6d6306 -size 684522 +oid sha256:fa7e8947b7aacc07c8237aad1b935637cb7746062664bee4f3eef90e0e8bae9c +size 677122 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 55af9fd577..805f5c71f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:704ee6de6ea6a6e1d88e43591f682c914ba16dca66c98ac4f7701741120ecec6 -size 801818 +oid sha256:d3b0fecddc9d6f50e0d4a936b70bc0e052fd67c58759f6264ac44f1970fbf0c1 +size 785982 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index b50b1ffd85..3d95100d0f 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9e7727be4d7d00801a94939c95cbe096695264f91d1a7c4eed33058d7bb95d93 -size 714490 +oid sha256:a0fb3400ddf38dd0b11348ba498b45a90252fff9e123a671c9f0222428826b14 +size 706104 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 327edfacb4..7dc7d1636d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:df6eccc8f9463bf141eefbf9e222136b428e70cd00f2800cf0f4ca362a1e964e -size 665266 +oid sha256:e3dee8db872c5aa4576a6ed764575e2c4de3eaf847f5100ef71b9938e0808c8d +size 656978 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 4cb1d12a99..068f7f9eec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5d509c9028ac349c89b87c5b36db533019f5d7cfb0d10a34bb8dab0190c280b -size 585385 +oid sha256:95681ba1abc15a78c3718d41bacf59d3b51ef4e9f68b3558c0e9ad67ad781892 +size 579761 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 20ebbfd0dd..9a4aacdf5c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3392710e46009d3eb2c25eaa8de23ea4ff996692c3271cb30f8a748af9c349d0 -size 692670 +oid sha256:4248ca89c5c9bb3371bfe085bba93e8a7bd0cb3e40cce85432f669fa7ae3c6e9 +size 681174 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index c4c03820bc..c7519becd0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bff8c42c5b1c4724684d11bfd8e55b253ec1e48355f121178d67be3cc18874ed -size 607215 +oid sha256:42bdc1316054e922de7c0fe512f22127d78dae1177360592aa5bf29bbf83b96a +size 602529 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index b8ffe1cc16..903e66687f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d688face18545f4fdb6cdbf82d06dd5849ada130e80225e7acd25a59a9f3c2a6 -size 671726 +oid sha256:5d0f87e862e517a5997af75bd8d547ef09bd5d6411054d3a2b61b2bd61c197dd +size 663142 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 6c2e8ab09a..16a46e04fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:06007e0cf41549083a2d429eed48314114bbfc74fa1e9ac3a279399c47798f09 -size 590909 +oid sha256:619cc3bf125f0a74bd4bc4ce4f9b07573ffe0bd0c6bf194053b8b1a415c5fee7 +size 585137 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index d53a410b55..829fe46d54 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:229541cf660937d7053f35ce5cca5eb2abdd194bdb092de36ca14436e31d8385 -size 699180 +oid sha256:5fb214c3dbf18f2019fd2c7e26c5e70aa55946973e1d76dde1ec43a5e6b7fc7f +size 687636 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index a59e645d86..19ae35f8b5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:173a6d4a3dffe42e8108a733ca466b6cde9d849a27b6e5a365a43717f09d53e1 -size 615403 +oid 
sha256:17f410f96af40062198f0eebe1169f0cbba8e546b1a7afc979e67adb15720657 +size 608397 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 2a6de38169..ab3da1c39b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8460b094a12a1f7eddb34bc5e74d951930924c6d0af586748a1fce82c4fd1e3e -size 735202 +oid sha256:11c3fa6f1398af83fb2f7cfad6695042d3c9f2805e9bd3fd96d75b3cb17f42ed +size 726322 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index d8b9eb1a7e..6e72557d34 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:247e2d41bcf740ff4577dbfa545501478b97309e4f3e430d6e7ff3a743222fff -size 655718 +oid sha256:c45ee3f93a06291b064ed589ea03bb0803a0cfc809aed0b7248fc2f9eedd9e3c +size 648910 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index d3912ac1ad..32f566d4c5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf87fb0121b927ab64a4481214aeaa7eeb2dfd28012b222299f3e78eaf1a09f6 -size 761620 +oid sha256:7ffae40bf53fcdb2112bed39ddef61d270960a272ea75384e931736a7ac6e3fa +size 750914 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 230fc16b32..66bc805992 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b488f1655610a102ab629a8b8643da6e6669fb627731f84eb975d069f603fc89 -size 680508 +oid sha256:41eca49caa819768aaec830add0445f246a528a969df77bf04da5ab9286adc1f +size 673404 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 4227bdbafc..45b29d1cb5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:020db6a3a1b10e756541352edfda3e7be7814aecef1367bedf01f39d6d308cb7 -size 750516 +oid sha256:27d5ea62c9fb9d99bfe4b3e13daeb3985d289b42775817f107ade53bf33f9b95 +size 737540 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index efc793cc35..33991644b6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e8b70d49d79f188c6bfff7d55d04edacda58706c746be2574ce9f4aca6f7c16f -size 667282 +oid sha256:dd958a03a8bb5e98b84c74aeeb149e0ad5e0860f6b2282c25c02758baf9a2fed +size 659142 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 94afa35c0a..0098687b71 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b563b6a6a5e6363a9f79064f1a75a49941391cdc44165cf6ddcb5942c779ec3f -size 777722 +oid sha256:5fc49ddc05100712135403b672e9bc551d53d7f15837abaef57c6baabd7c85e2 +size 762528 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index cf99ab06bc..72207219de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e7a90910aac556ba090adbe3cb54014c0caae772e06e11ce6cbb71cc613ff47 -size 691232 +oid sha256:920e63e6a494d1abeeb1be87d46f1350dd576a6867e3ba30d6a43309a8c497d9 +size 682402 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 820a5d45af..8f4d08c0d8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be99591409b3369f5a2c2b3854576b9551ccc5016a5818549a8f73d276e103f6 -size 755052 +oid sha256:218db96b6587a741571e622c321e07e70513216725c4a7d96d2c8a5e8bb1c236 +size 743754 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 3274a74ee2..46c50d889b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f97b358e9698b7b34697030b3cc075f88b047e0ea87994b7c98173fcc9b399c7 -size 677294 +oid sha256:9ea4d38c57b5deb01a6cf3bc02d2274e260b49b42db0c22cbab0b56564fdca7c +size 669894 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 0f23dfdfbc..140100c566 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f087011bbe465b58c5bd83821d03053177e74b7e69d6d21d02123c6107abb5e5 -size 782406 +oid sha256:108cd6acd007627417fef5eb8178f08b96cb434cf3bc3cdee08248a2430055d9 +size 769432 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index c95dd6fea4..48f7819d64 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:76ee46b62dab9aefcab786c1aa3d6da8c8109e24be80418b06dc87854f2769cd -size 697398 +oid sha256:5fbcb9a0580a471808bdfdc81aa1db704192854c7085533cb5ff90694035d0fc +size 688764 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 9c654f378a..11845a3b6b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e6321c850786df4e2195adf8337aa69b48507b423e1c6803653b7159e951627 -size 822476 +oid sha256:10726fba45a911fbd1ffeb73a7a30ad5aa5d98df4437b3990091839afd13d66d +size 809796 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index e1f4c66141..42a88aaa95 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:99c69cdd5dddd023539ae05ad84f32fd42e9d48bbd449546d3696765ab697250 -size 741560 +oid sha256:d48fbcf1c6967702c049e6245d7b177bc356b9ea1c434c598a6c53a37ccd746a +size 734604 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index eae2b02492..a84acb5bfa 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a10b7150ce65a810e9000cebdca67c80ba6ad9c8dea1550a1c042a6fec4b4c33 -size 849928 +oid sha256:5b278b979d5b70d372a46fd5235ced9b9feca9ff5992e7ef45d46d42a1a5a03e +size 833944 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 2861f71293..089ca0fa8c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b72a98dfc7d9979b13089150bbd330bbb5a5d9244366ec0d65ddb82352ba7383 -size 765018 +oid sha256:8302523fb16de6c2a044e8b9e3b8bfbe6f26b7f1c0d407bbf045341d2d2e57e3 +size 756582 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 5ee209c178..63fe2370f6 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e31e108a43d2ec6d14bf768f2213e9349c0bb072954e11e20b3a8a84520061e3 -size 664822 +oid sha256:f2052055196d331d33149380bc1648cb2b366fef27b1c166da4a7ec198dfca10 +size 655842 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 20602199be..83f093df85 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:52419591cc251c2beb7a9aca2f28453192275ed900f5fd054f437567fea10268 -size 574977 +oid sha256:f1ae05ec6138f1f6cf2517b91cbb24dabf150f44cf041a0d20caa2a171fa58da +size 568761 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index d688f0fbfc..7d372fc525 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:58e6fd4f868c601b7cfd788a00ceb07c01dfca0a2da40efb03e1d62646e7f8ab -size 681964 +oid sha256:7bfdbae9a0a4afa5f0aee32d740b448d1bfbbee268746f5c564d254077d40793 +size 672196 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 243cd0e3f4..2aa31b79e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:744d0b58ba03cdc9c8e12aa0cde016de6f2b84b637bb118b9e77264b2e16b82c -size 592021 +oid sha256:2637748829f70224478bfb35682ee7da1739f66d11db944efd782251430d8bbe +size 585015 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 39bd6970cc..df7e4c32cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:d894ef4afe5f743de90839f9b6e54a17b6c99fc630c120d9b3cb2f316a925e8e -size 663092 +oid sha256:f3ed5a4c94b845f05fc9da406a8605bf29260c69d9ddc2a0c06adacc036529c4 +size 655002 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index e82a40102b..4bbb98c99c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b542c57f8237979198df632ecd780c5c8c69e61f7b2034eb768f2232252d5647 -size 577639 +oid sha256:7893fe3d7d6f8ab661e6d2b0305a593c68ac5fc9a9a99d823f1fff0c5745b8c5 +size 569203 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index bada6191b7..7baa98b6f2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d6c9165f8d66343a04511501b2835097984cd1caa95f9bd150dc7b4655f67ce7 -size 681124 +oid 
sha256:ce07ab75dec1ed246bc54e01083464f63b5dd825aacfda123343a094fee8ff30 +size 671454 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index f6edd06da3..46242a752b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f65c1ddc15a26af7d5059df3f68aba0be2a0b8b3e14ef2516241cfbf83170183 -size 595471 +oid sha256:496de34171478dba2ea261dc0ea94ea1d043f8ba3260921678ef8909bfa7b0ba +size 587035 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index afacf4537b..a8169f3158 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5ab983bd8cb0404de824265822d904c2c3c1416ecad3d4f1bbf2c2dbdeba5a7 -size 732982 +oid sha256:2c78c1d974f4823d16629f919673ade513e0bbdd4d40ac3cf66b4350001953ae +size 724054 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index fd9bc31511..6cd6ab8ab6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:74915526992e890263fe284f900ab4c4adac08400fb5b58afe3e8e12092a73a9 -size 644964 +oid sha256:7ba1386b0a1d0cc07934b1d628472d29775850f312212c488489d7742096b410 +size 638058 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 9802576c69..c480267862 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db690c87df7690f07d604f7d741ca4cfc89e68c697498a0d897bcfc94ca32d0a -size 750224 +oid sha256:b18d0849eb8e40f9297a352cf97e547f3476f41f4771cc8668c9643a4de8fcf8 +size 741294 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 5d9d15ef40..dbb1aea3d1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b62fe946bb532b2bfabb0e1e38d7c7156de4a69387f3f5e4e253fd3ccb1b326a -size 662846 +oid sha256:cfb7c1d672a46b08f1b0d9b774e508b1b56647183f636284f21648172f77f149 +size 655940 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index fbcee6c95b..c72a281b5f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05240d0cf4d34df3fcacf3ba75e3b8fb19a2215b59e0fa1eb2a912fd50677ed1 -size 753278 +oid sha256:0c6d6deadeb0eaab5b1c84e66aaa8f795fd07ebdf7d7fd699be409c5c3168466 +size 741636 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 340ed5ae04..d61e2f96a3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:063d0c81f341b52c7b38864fc2ef925610def11b1ab122f5adab33b9fe4c5d70 -size 663632 +oid sha256:961f4ff6b4276b0a9c3c66f439a27853ee350c3cefc7dbf1a3f8cb7010e08945 +size 656824 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 1313dc9c19..9247c4b62f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:76984b16f9034ecdce9042172df6c1705f71aa787e862d1c4178979adafcb5d0 -size 770618 +oid sha256:23198bc47e2886ea3663a52395ba03cfebd96e40a6021c7ea2a360f298abc2ef +size 758926 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index b535e73e56..605165c659 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b3daae3f6403c2246fe34aa0458648f054f957ea9ee707e459777c1150a92a1e -size 681416 +oid sha256:d465c926ec0864c317caa964b3cf7936b741e91bfe8347871efd344c8a4963b3 +size 673028 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 4d5f35c5ec..942501f429 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:148bca9b77e1d94e0a1a6bb03779d1d9c03de0f295c78c5001b1f9ee22c8443c -size 753474 +oid sha256:c3ab3f24ee2ff2f0029baf962e6b897b0226cc327ac5efc54d5337895bb2d519 +size 741930 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index f3969d25bf..5d65c98807 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c611575855333311063d9e66190acf97b0badc3563cb05d12ba01c685fe2acba -size 668612 +oid sha256:6909eb82c2a1b3e7792467237cc57571f21980e410446c79b3224dcb41f966c1 +size 657068 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 2ff2fb4b3b..b97ea7ebc1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:11b8ff998fba20fdb034c11b906a01568cfaf39189faf8c4e47d4bb18b9d72b8 -size 775402 +oid sha256:fb2a1f4f3baa6e2791dde00594f40d0e0a7f036c2c2423c9723b6bdf1f9e178f +size 759072 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index bb8b01e4a3..3b610f1c61 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3484eead6d48f65d857df3691b318a20d86d60bcab55904134c34f86b8c1b092 -size 685656 +oid sha256:162badb864a7b3b7d69bcdd36bf2996700ef292ed43da83ea5ff0b145ff50b97 +size 674210 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index f2ed0979a1..b878230453 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a08e738401a4751c87f58f078ae3fe75c9a342eb4d9016e3d2ea8b9daad44a8f -size 824942 +oid sha256:fe6557a687fc874cc7590af332b9e7af641a75abc245c08d20a3f58eff7c1a6f +size 812954 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 5d565dd1d5..469bec7451 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:296a74e93b69ff7e5dadc7c602ee91a177778bef1ec81d734ad2a8dd535dd5d3 -size 735740 +oid sha256:bdbf22d3e3b05fd6e586ec5eba5d87a306a9410affd4d178585b152484756adc +size 728142 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 61408941e7..1808931b56 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f91fa0af7531a0f497850c7241f99d5f8e6c86565e12493a5397f08226c1489 -size 839470 +oid sha256:8d4df71bbd472de8ffe5d56e266da90db5c7b490208648d8ec2d543e8a3bbe82 +size 828716 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index ed06ac3fbc..f84804ee95 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a72642ec1d28fd1a145f91148a02d166cd90703f99e8d2d05cb271ed88f35705 -size 752586 +oid sha256:2e4cbcbdf2a546fba2a84d34615ba8c063942e711fc0f60812dd996d306ece66 +size 745186 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index fb72a8f8ce..0b6d08b4be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09a15cab79cbf7be26bc65452b8b17cef00864f3f4bcedd56f3231675755d545 -size 653670 +oid sha256:bbbae4fd6af311317ae56da8dec4a7b52b56180bc1010f264809ca04910137df +size 643804 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index cccbbf760e..c46ffa1053 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d590f50be54fac27ff5ddf4976f8e6bb6c69d4f387c05b5532abedbb197c8d6b -size 570831 +oid sha256:94c61e9ee695c24c2c9f6eda3ecc776f8e4e6b66aa11bec2350f9db78d8b4442 +size 565995 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 307ab47a68..0297dc2f37 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3eb7cef01533c78f494c1551f601392151eb00e3ada1396a9e8ee9c830a672ff -size 680334 +oid sha256:965e1a08f314c5ce6a5b0aa930b343cd0b8ba027f4e15a0f263bae96b2679a96 +size 668790 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 468b0ccec0..d78a67836c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b66565a00c58f6cc44437ac3e6b4e0cb473b00c70d5c01271147891123b30b1 -size 595571 +oid sha256:d4dd63ea36c4c5de4e6a0617cad11860d64f29e4fef60a9822eaa84c5b1cb813 +size 588467 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index c907dadef4..e06ad6bbe5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d6ebbc2a15675ad4214d9ab2d8be82acc6923f3c5199992d95e4aecdad4c9da -size 658602 +oid sha256:dbc780cf7c96e90a9bed1dc74cb6ba83504b023fb7740f6cafe34cf248621845 +size 649968 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 46adb246c4..f8a79b2597 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:a683c97cf5edc5f29c92b8b8db176d3743b2aa5100e7b7e6cac7a25528c56b4b -size 577193 +oid sha256:358a6e52cdb9122d63776f2a8f57a0b022256e4f0f668ff6e9843700a54293b6 +size 570581 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index a964ae2fc3..05ef05c65f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:366ed851fc30a8110fdb66edbf22a2bf3e1d18076601f59ad3d7f653db4f0ab5 -size 686054 +oid sha256:6fd22c2342da8ef1b729c2cd6bf1f742c50714680a07565dd900823b82521572 +size 674462 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index bbebc34c6d..9bed9c4301 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e0c866f8cc2e7a8be461f9a3e9da6a964982a285cf8d92493871fbb388ae4e2 -size 600847 +oid 
sha256:6217081db14a890d868c657c70652f9fe6b368c1bca2a164dc9c37ea144fa517 +size 594631 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 749174b563..376cf88c53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8162c77e07aae4ea747f3a5bfef6985a9f8ffbe5e0788b7c3af0dbf75e28eb97 -size 723656 +oid sha256:6857199f5822b66162d6ad4575b54e95ea232317294bf4b8af97bb6c6421477a +size 713938 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 44ecdaf9a3..1fcc9d1937 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:44b0275b835e6deca5705ce520d8c65a17e5ad433a5e5cb691164f99ce700f1b -size 642002 +oid sha256:07e80f87bfa9892991c4ea087266709dba89290bf0979eedf4693fab16f73db4 +size 635144 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 99d151c07f..22123e524c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f8fe09cc3426a4550c949715556ce284b68f317e68358e747f5bb8d720976b34 -size 749284 +oid sha256:25c9aa902fdf3660925149d4a089aa6894075b9a791d5ede29df76239017c943 +size 737790 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index afc07e2567..0989644f44 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac20ae59b12d5f111304ee871c48908f30bc8e095e955f95de1192777986dabd -size 668024 +oid sha256:3197d4e60c0107255c340064fe5c5f9d5898b8795cc97418427241314f537fbb +size 660920 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 922feac291..4cc522b050 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec6f9a9d4bedbb571eacef14bc96431bb57b51a55d4eecd35c2d29ee36d6775a -size 738970 +oid sha256:9da7250ce17cb39dca52342647294db22839e414dcb02e2e98bbdf8f00b59374 +size 725600 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 1b0e5ee054..011c7f0f1a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d81e41258bd25d53bd13ae16e605768c661af1e8e3a49e2390485fb0c66c0f9 -size 653566 +oid sha256:46f753c2a22db646f14d52af7e5804858cb0eff80e9408c27d6e1c063ed2e5fc +size 645376 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index f78aaeb92a..58c7633c39 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f3634b6e1c4581c0f1f3e7bdbf1f9d425a3d63c17c8d48e9cc4398114c30907 -size 765386 +oid sha256:84470f27a69d5d8165b0cac8e1d6c51022b1250157fdb173c6e0a2c4c405dc3f +size 750142 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 868b9db794..6bf8732d19 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7b5895b782ca39b44e2d7add1c3f8e37b6c9e6445b049f6af3ec896ac6462d98 -size 677466 +oid sha256:5e7d5b798eba45f7c1493f04a231187aa53e554279cfbf43731560c9945dff81 +size 667846 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index db6f9626df..f6ac5db15c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb0b4c1051029ea89b87f00e4565611d9196ce16f8b5eff0d5a0d5addcf10466 -size 741928 +oid sha256:5c0eaead9ddfda26d333c3f93fef81ab38018aee9969318a9b015533079c9fb4 +size 731370 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 90bc307e8d..82a6d89950 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c32f914d0b055b4bd4d39e74f84eb19d810054fa3f39eb8cc14e562bd69d20d -size 658200 +oid sha256:ec97478eb5957ea1e2b68c8d3cc4040950c1f8021bc81237dfabaf961bd70f1d +size 649962 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index e134438a97..f2c2e873a7 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c918f1c3c7cb4b1247f8b232c505a3214dcb64c79674747877a573d6c5c5c131 -size 769232 +oid sha256:90b9864a0ee02f81916fd7ee7275de9ea19b16b08f142264bedf919aca0cfcf7 +size 755468 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 2295d973ce..28bafee3fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cba7e056dad71c4712d8b62bbd92f013f37a720cacacfcc6bc18e4f0d9e9f647 -size 683632 +oid sha256:66ad6a4256da3aaa3205d4658628e62ad9c61baca80c92b48048713efdd6568c +size 674208 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 5f39ba1552..bab2b919b3 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:90327bfe65945da63ac346f8b7ce030525e06c855fefa3609123cc616e7ef6d7 -size 810140 +oid sha256:2727d10b229e6765700841404ed43472b1e3d57303e026d9709475dfa2b24700 +size 798202 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index ccc67d4485..29997d6a64 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1aa17719a920ec9d86e94e83bfb096309745cbbe8726e6293f790a697f9b13e7 -size 724490 +oid sha256:1c42b7dda278f08f796d49efc7066b0eaf833d2cbf7880c63032b4c3a604fe01 +size 717534 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 25f20492cd..6d608ae91c 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3170523b9758a9b7d6841d2fad4d1ea97ad13f86b760afae2a2e5088945a885b -size 836804 +oid sha256:a76243fece85395ffa1c3af92c5e2e44303168618e90ead1c0b6c352159dc463 +size 820820 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 2c4f61a73d..d67a8f7cc9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3c25ead858a78bb3396edbda02d1742314e1860c0d468d7ffedd1b9b1e1c7341 -size 750414 +oid sha256:f7c3efffd40a544dc1dd204692590fa256c3e34830a09ccd31469700c94c2a9b +size 741188 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 6ab5c1f485..b0271ac57a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5a93112b20138465fbd298334167340582a8b3fc8e08a91a833694be1688917 -size 694780 +oid sha256:53f6c6fe6f7ae262b7bbff063c1184f5da96f55de30808da684840e5c9e086d0 +size 683336 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 30fe4aab71..d90f84a727 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a17f8268d101fb9ad440478b9679c66e43e1cc73ba9bec6f0f8e7e61880df7d -size 607303 +oid sha256:39b327aa82ffadac06d1e244b67ff4a8b9363562f2ffad9cd31501db58498ddb +size 601531 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 943bde9556..e2630e6ef4 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0cc612ee5b8260353ba849fe2e34c0775f963f729cbaa6999c7c28cdf186c62a -size 718928 +oid sha256:8e0d3b28ec95bca284650975e8f665a976896e3ba607d943b8f35964985bb846 +size 708322 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index cb5cd56f89..fbcfbdd70f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:170bc0edb5fa1cf1623f6b6448b83298e99627028734c0f0361972bd246a9389 -size 632834 +oid sha256:d11cb6fb549b7729980cbb36b36b25fd99a5bd6b4f378fdeedf0ea8dc3fb74b2 +size 624792 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 933dbba60d..7a01687400 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:83e6ae9fe46661b5ac5eafaecf115ef165e97ff60c705bc008c497feabf1cfa4 -size 702574 +oid sha256:db0894f941d9b4a58e17ff36b1ef20b56fd62ca59a7738e8ffe39ed51500fd5c +size 689550 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 76364c4a8b..b8c68588d4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a831d4fd1278441c20ee688ec8f3c9e812d937fe1bbc40a275c66bcbae580b74 -size 618896 +oid sha256:a3b1af405b8a765bd3b7a8fb48191c718cf4e681455fcc9a92cce65d583c6d92 +size 613123 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 02397eec81..1708a1662c 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a93daf8d550389e226f0f1f31da87609b73b4be61b67c6093b9ecb73e43c206e -size 726326 +oid sha256:c65c96d212eb7e6f4375c2956fa7838f94b37191e2311417c145de94d9b55788 +size 714782 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 3b94ad8fb4..9ce5927049 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bff472a6fb6fd216b6ab0e40754d8c9bcebb2de74e77a86dd2c67196b0fffcde -size 643390 +oid sha256:5251b8241ed766709a84d2e0a1958c7c5d20eaa81feca36e26d32ada8c34ace6 +size 637174 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index dc32e2f8bc..b2f3b4af9c 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:522811f73031440bd4b5461483d1e4545d22ddd74c6408b3c81e45e3a35864e5 -size 783780 +oid sha256:e78b8c706f5a53d4c1774aa54e8e5a6cd7ed80d2bb6375034f1d3af805296dea +size 769868 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index aabadeb2e1..ec1174a6a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9abb35d2e7a27e1ef212b966908ddbfe7f66c9313e39098668453232f3756b71 -size 693048 +oid sha256:ef31aaf5fbc1ea8da02d2666694390349286297a795240a6f444c74342cde1a3 +size 685154 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 11407b3724..ab019bdb0e 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f07cc5dd4233fc26827eb78135a8d655744158984a1e0db3173d8ff043551be5 -size 808470 +oid sha256:229ff35ca0e495153c4c7b82e411e445d90caf90704a16ec8527718096fdd9c7 +size 794066 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 745ebca73a..0ebdbdc5c0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:038e49c57101476ce55edeed1a390d314970ea792a1d843a4b46811d710d2a12 -size 716998 +oid sha256:468adac1b64d911968e11686b05dd9fbb7b8cf0890b5c1b69e5fbb562a6dd7e8 +size 708414 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index e5db812660..8476f1bf64 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23a6ae13184e2a06152922db8245502c2729fca28803634e6d713e1d5f964b5c -size 791672 +oid sha256:1edba4c322d4ea0a80136fbbc8dee49d74de2cc9f49c19deb58e9e7cd1642b02 +size 779782 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index a11649ab1e..4f9ddfc440 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3915b1065927382d34ecd7052dbb13cac2cccccebb9b97eddbfd915eb8186b73 -size 710708 +oid sha256:b7347f7aa0efd4b324b55dad1490ab780fb9eea576830ee887c78ccf8e13b08b +size 702568 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 36239ebaec..0443e4af6e 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:081df7b61e0f043c39aa8436f5a886b1060ebe71c482142c466d9b299e6fab5f -size 815228 +oid sha256:ef94ae32c8ac082b5295a7662cf57eeba79bef6b77aedd60dacf7e3aaef9ac4e +size 800970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 52d042dea9..4eda1d0272 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a7a90aa10d7f1cbbe464668e243e78620a4a43f7711018ec6779d0cd6d97f1e -size 735200 +oid sha256:1788cd946c7e2a710df04c7f7311a513d78887451999d8c3f79d75276ff48113 +size 726568 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 7b55b2322a..e3030c14af 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a7119051359a57132b002afe306fbd9e61ccc00e0b3d32edd17e33e838b8492 -size 694784 +oid sha256:23c7357c0e20173f44943bdfc4f4ea3d5b5cfc526a4f4d724f8bd0e49a619d9d +size 684178 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 49f17cb4bc..aa293cfa3a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e9ad978cde70a54794cb327bb94a83e98b3ebaa0332dbf413912399755737cd -size 608887 +oid sha256:aea1d9537d45fe3caf282b3211c3936c04a1e3a71ca6fc9310c2de5703e64286 +size 602325 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 765998a923..4343b590db 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:19247f6b1b2317165c333e3d73a3649405e5a0705578bd51f622c761f1ff12a4 -size 719624 +oid sha256:192ee499d541cbf3bb555d4ff24c6737e7fdd33695aa5d4d1c7a0db1c55750fe +size 709016 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 04e035d5f3..516b295dc0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0705b57820dfb5c6375450946eb644d9e80a57e4d5929581f690d840883fdae4 -size 633628 +oid sha256:bcd3bfb9f490737bc6889123feba75436578a0d45011f7823d616100d9cc9b14 +size 625586 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 05876e8650..468a4629a1 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a5d244771201af33b295c8fc705fcdc530631d36008236fa2eac94b15c6709ef -size 703366 +oid sha256:935a337dc7bf1b46f7d7b510a13da0ecc2cf28bc89716bc2bc05425bc5325c38 +size 690342 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index b3a0c07ea2..ca10847cce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17fdbd370a4f4d3aab9ddc3a5c55fba6d048e76ea4bd41b66d82ea224cc0a75e -size 619690 +oid sha256:6beba3f31a708f28f47112f9add405c19f0f14ea2f53d9be1a104a2e4ce38f90 +size 613917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index d47eedb4ad..72db5fbe00 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:acac50da9263a3992bda6fdbf2dceffe5d017d6b8cd72134fd4dea16c2868fb0 -size 727120 +oid sha256:fc7c3484c10e5bf5b6aa389a201543cc035d00a3dc2be97d942909528640f4c7 +size 715576 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index d8b7ee340b..72281f4782 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4acb39e4c47f22e2647d947853b92b61b3e65360e8be4e42d5c49cd685d5bf2e -size 644182 +oid sha256:07b8d64dccdc94cfe2f41e2776acd097f442ebba58da5be98d691da0ec8fb09e +size 637966 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 83d544ca9c..58cae55979 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26ddc4de7e074c3357730756dd483bc4d3338198f3fb8681d70b96d21e136e95 -size 683954 +oid sha256:d0a28d0da314391abbb1fa94d1f300cca15fa0ee4d794a193d14cc08c73e847e +size 671670 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index e9ceaa4c14..5837dbe132 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e06ddde16220e31102342c02881b47e19aaeea0f90d7dc93b20699524d08a40 -size 599191 +oid sha256:6c91a6bca862675b747cac103b858ba03ba0ce917e1fc1f0294c557030dcada9 +size 590409 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 596194d275..6860798e78 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd36b11751c3ab4415021ae2a9d1fae950c32578df8161032284fe819f03311a -size 710570 +oid sha256:271b7d6e1b7750cda97f70d3c33f632acca7e4f5d67cb064633487784276b7d4 +size 696658 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index bdb7e1dbd2..21d9f04934 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c223334bc500a0192508dcc560d603d4cea9e8e84d60624c5e0a3719c5e1016 -size 623094 +oid sha256:de18b959332da2b905d84696a4e8c41bd1762b6000205962dcc0f974dda17908 +size 613275 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index ad161cc6dc..ccf66e41bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:d59ef93324122fa8cdabe95ed836214ecd107c22ba09fbc30ed3712575752878 -size 690662 +oid sha256:31df4eeb7a16443420aaed24beb95cc7dea5925379da9a309d25e11d5a6f9d33 +size 678624 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 7013124493..fe156e6983 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02688467c3fbd32e61b8300fb1b3e57b7b354e1983af4de7c64a0edbb3f27fc0 -size 605503 +oid sha256:02ebb1f606b8d05aac7224130ed0635b2a1a95f1b6a2f2450a90174474a35a45 +size 596575 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index a8d74322dc..3c01c5790a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3c6d6d77a5ce68dd983c44d94c35ea09bdc24aab92c1d2286cc3d6b31f5687b -size 717326 +oid sha256:9c8f86de40b833820d394b838b34e83e071f7e81ebb8fd96006ea179999a91f1 +size 703118 diff 
--git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index bf361f58fe..8a4a9b3151 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:66b4400e2ecf07a2777cbb8b9ea87c33606cc9550d69c4ae8126efedd856c30a -size 629948 +oid sha256:a253aa87eef9f9ce276beb9f2736e04178e6b1bb3894cb33811bcb7acd2848bd +size 618996 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 8c8fbeb466..dc3c3541ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e0361244fa1d1529b755447017660c116d375febaa2dbbd8ecb4fea1ad3c56c4 -size 753892 +oid sha256:08cd75808ca7e48fe34c8266dfdf5b0d50a2e0d6616a764d77c1a165ab0cb99b +size 741016 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index c915f1a08b..89c17ddf8a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:56e7ca9583f4b4279195712b31661e067a509878a8c7d4eccc4f9ad90fe46d72 -size 669474 +oid sha256:db5ac753f164e06c38a5d39c82de3f55c7d0329d62519231257b296e3fff8a72 +size 659558 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index f43e8a2a52..6e515f0de0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b11f2f99e1e442c6edcb20e940a4cda4afc1cebadf82937bb1dc522f4fcb5599 -size 780310 +oid sha256:0ad8ce21f7e24e3e663ec118dd9746415b5612ac5cf8d73c4f7947ab6abe0886 +size 765608 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index dabc9a2da6..9e5219a5fe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa31aa235e6d7e0ae7a13802a8ce67ba1dcc0df0bcb9c4e4b2cd0fd9115ac402 -size 694264 +oid sha256:5923191ae5b1044dfff0deb8b595838979c1d06e86b95590c872693b596fa52b +size 684150 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index beeb8e820c..eadc78d429 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c40b114cfc72fc06e97217fd416771b37257aae7b587307cadea2cce0610b178 -size 769204 +oid sha256:07fde7f642a64ff1e2d747f3f1ebeeeacfba510194ea40c1adcc34a5e26f3197 +size 753418 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 6e9d00af3e..7f26956464 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2762faa08a4941b88f668e5ee90dbf171d04633e9b3efa8c94651de528d4560e -size 680792 +oid sha256:bec8b6e313791cb6208d398b0c922bc51b35d2c624f6999e23978862ee237cdb +size 669790 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 6bcac4f125..89317061ea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc7cc09599e577d67fd23c81165f4d958af0e6cabeccbdf4d6fea541e7c97704 -size 795622 +oid sha256:afc840272884a2d7f8c00faad25e718352e6e304bb8dc2def892f12f04e8b4e3 +size 777220 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 2c098117cb..01ad7b448b 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a09e8d41f4b680142d16e454b4bc18cedc453f796a49b4bc945c6fb1841f4f4b -size 704742 +oid sha256:cf4e2ba716c991d63abac757ef2a2d1a6c0d604259cc74d0eb47a4563d994df3 +size 693000 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index de4f88fbd9..7550f0c03e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be7a1ef57512f54353f1639e668461cf011a5f927a37e38c91b1ab04c8223876 -size 773988 +oid sha256:45815838a285e8f4e63813cff949edd3b1469890d5bee122e49766eef3fc281d +size 759238 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 4929992b46..6156ce52b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c400c5ca6362b1657b51eb2a848c47b76e37c57b4e8130d232f04b9c55fee7c1 -size 691100 +oid sha256:7079df78754247bd8e6694657ed9562d0b374b5a297113b57faf2e3e3c031412 +size 680542 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index be4f12ae20..09fc9ef02e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ccf5790af5ebc5f4455f9b0dffef8817cf057a08e3a25764c447ff27de6e2605 -size 801342 +oid sha256:058c9ee711ae792135204b4d029c3bdb05a0e4de1429379f8c00369831111531 +size 784126 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 6156e777cd..fa464977b2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:d5b411b322d0d4e5b50f1a647170dc97f39a8e72398455304945c9293b70ad24 -size 711154 +oid sha256:79503fd890adedf43a911fe14623065269cab738d1c9c9a601d837fdd0ebf0be +size 700202 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b30afa46bb..e84a92020d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b000b38ea88a3be7c875aa9b7399f2b89046321968481cbbfd90402f71c6688e -size 841164 +oid sha256:4a11e4fd9a3de11158ac654e6ac380799d111949db8dbab4ab4184c4ba0e5090 +size 825280 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 83230e3143..410d12ae8c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:973923f630276873becbc2cb4c2cf36e4b19a06dc6aa5d473f1acf0dbf977214 
-size 751962 +oid sha256:2c076e7cc4c1df756cd751bff43625dcc5fe9891a73146a42ad2e942ac81e573 +size 742688 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 48c13aa5ea..f6dd3fa0e2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c41e2c7956c48699602a9f2a6db04c616d0c67e40f42783350d7936bb05774d -size 867828 +oid sha256:855b70dffebc193963e28aab403ba9cc5ae9707299deda3bc3b5552efc2fe5d4 +size 848638 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 16c7864fcf..05ec8c69ad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b1bdbc030f129535f1889e96517a196ccffce5dfa5df2692b9869d23706372cc -size 776702 +oid 
sha256:d7d957e144d70b331578c4cf9ebb550fa55c087c048b859509ac1a00cccf738a +size 765256 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 212b1ef021..4aa169f115 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e63d74d189fcf857f2e4c8427fd3f8f66609597c3f8591b97182000525d5adf -size 672114 +oid sha256:4835b94a00200159fadb1e4732fb218913820452baee91005ac1164a06b8c706 +size 659930 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 4ab88b63ba..cdcde286e7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eaaedd731a8e6bce766ece794c701b546745eaae5734288e6db93637fc9fcb44 -size 582961 +oid sha256:9beb7c556b932c33b9c4a1fee94968a40e7e4375ac8131c6652a477b914ef066 +size 573439 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index aaf4c19afb..51c7d0b483 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8691fed45c15edd3762735d819df94f4f721cc2e935eebbbbbc1f5780d71224b -size 689258 +oid sha256:151066c5ae40269766bbdc69a8d36f30a0506544ced3e7b6b7dc5183195ba959 +size 677072 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 5b5a0f2b97..f006d5ee66 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c932b3362b8ff26cb91795d5cd8ac0d0d224ee4d95935122053672bdbecb1391 -size 600005 +oid sha256:61060e06486b2c873e9a37efab1edb1434d04d6efb5069cb5e25e6f0b6b689be +size 589841 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index d732478365..dc265d17ee 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f3be0239e938017c783751286e0ac376c936d947f58c00b0df21e5594f8125df -size 671176 +oid sha256:0e9091a712b623485413bc6106e2f65b8fed4e8e3ba629afa98d18e6c478d90d +size 659878 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 0bcbcb058f..53ff16caa5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f8bf12c09470ef8590b8b9b51cbd2661af6286665c62d3c8d9c5d5d84e58d51 -size 585671 +oid sha256:ada87c8443183b495b844d2d345cfff443205a25daea12a0145b838fc2622cbc +size 573881 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 4a5d1da298..e27b5c8f2c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:30bc3812258c53b37e9c512220da974957441c24b3658073425c91a8e0d9d2d3 -size 689206 +oid sha256:3e4d2cff985bc4b88c9eebcc4b717c289eee0d4902c28af286f9560bb9e8296d +size 676330 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index c740c0a8d9..c154f3627c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:564e0dd94cd89b01d443046bf3d411e89651ad3c028a594bd4e74e4bb6543144 -size 602765 +oid sha256:4f89d48c1e47e3334bd17e8d25b56d97d029d31472615be016b73a6017477da4 +size 591813 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index c74cb0c683..d577c0883e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6bb9e69e16cf38293bba455aa19bc4fba818c39dc1331352faed48b239d51f37 -size 738056 +oid sha256:fd3edd3bb74d671681c9809e5074e46e09f1b8c0db8a28569c6da00c863cf5fd +size 724934 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index aefd9b875e..2f2794145a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2435ed27c51f39a0cb7cca3ee0a0d5e6085038ecf8f533202304968cec17eca0 -size 652258 +oid sha256:0f7b1adc2d3b571b19749d1281087a680d096ffd8094fa14f07d58e099f2c092 +size 642786 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 22052b0f2e..e6dde1bf2a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:438847cbd81153b2811cdd290d9869012fc4c396cc5da155a7bc36f69a2979c7 -size 755890 +oid sha256:51c54b1c79ecf05b2962f817294dd406e637196606d465b3c1c44be911f3059f +size 742076 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 3035d166ce..c853c00d59 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a4a32858dc963fb6ea2be428501a838ad5f357ce141900771cf0f5c912965ec -size 670930 +oid sha256:fea00852d5f49cdc6e27a1336f6f3b977254ddf907c3c32a5daf1d5d901901be +size 660718 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 7882b251be..0047898b59 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f55a3ce6491892068cc6e2eb34e571e2ec3db1b6dc3a0db282b3e1cae5fbf96 -size 761360 +oid sha256:c61f22e9ab7db64a0e68adb220469774d76f2ee8d46ac4a99614fc99e83c0c76 +size 745772 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index b94948d5c6..a1d7a2be1b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d1d5f49f360b3e52e42ee41c949b9af68f6f335c7f76dadd811f17d4466afb0e -size 671714 +oid sha256:95a5761db0a9cd1de47da66c494ff4e9f629b508debafd0f7d855e98241dbb4a +size 660910 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index dd667a3b4a..be211574da 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:98e8a55b1ddc762fdef033c44e135dcff99b8a7711aa065bead9fff69acc5ec9 -size 777912 +oid sha256:b35821d76389e22dcd6d6a357f9fafb8be1d35c44fa7e4b492018e917badab1f +size 763012 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index e7973650ac..e7d3d7a570 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b4ceaa1dec067a24d98b199122d4cc319a76ad06087b743e03351ea590d19a11 -size 688708 +oid sha256:b9837eb12d826e57ae98921e2d4d7c486305b7443f8c91761803fdfa27e018ae +size 677904 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ad78b51bd0..8ae248d1d4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:66a70c390fad8b9dadbf65dc6ac66bd30e13b2f4815bfa994a4404f1e185d1d9 -size 760766 +oid sha256:2185faec258e5ee7c7ea44f5f478ae05a904df420d9a4278c54bcebd4de07130 +size 
746806 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index c03fbdd314..4c2fd72a53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:615d34a819c2bbbb5d7b3f83c1068584a1e3d3a4bcd201bc427ceec0bc73f114 -size 676646 +oid sha256:85ef27870209f1b951c02dc3834846e1f35b4e8be8291dcf337995ef00eff5f0 +size 661746 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index fdbf29cadf..7696e78c9b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:401b78794b1755ae87465b9ead2974d41dad4081f6c0f5905decf7e169be4a2b -size 782646 +oid sha256:32f93f0007741426ccd40f90344406b2ef8263b6678c8bea5d708d2d2700f95c +size 763158 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 2b1e47a82a..0bdf68c574 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:974414c1fc3dd8b9af6d9bb68fdcabf489829cf1f03d7cc8602c47eb62f42b2b -size 693738 +oid sha256:2d4f6da867ff8afa023704b371189f4780e5b62e1c15d7774c9f504be613532d +size 678938 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 62c01e60e1..3c0399a2ff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dafc9295882852729ee66e0297829a257e4ea4b96a34ba18ee11e3ac53a3c887 -size 830212 +oid sha256:d65ff9deb06fa68f17d7d913de02e343ae51889da0ad77fea5395c61099e245f +size 817830 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 54ab54aa6e..a75e5fd19c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:094bafb07129838b9db800d83e47247121db40d35e0c7d3c6caff9ddadeeb1fc -size 743032 +oid sha256:9eab14076bf1b586b7c40be510fe8b5238c667d401c3899b2d554de625c73980 +size 732032 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 3572fd7530..c5091967ce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7b38705fc54d41e3135c15062dcbe3bad4448d2f14a9cef56f0e7876775a1ae5 -size 847552 +oid sha256:b0a257a84c2a8f5d7fc1f1054022eab1e3fc914942229c7d36740c9884976ba8 +size 832802 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 6a4c1c2ee1..10be92022e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5ddf7e849a88ed7d1fc7c256ad6832b1441d679e936bd3bc004887468757d03 -size 760668 +oid sha256:e6b19250514520b518405c3368eb089e40d4ebf4de9313adb481fc5a2af77714 +size 749914 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index e0a2487686..4c3a2b4022 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c8fd5f3eae3f35bd407316d26e0e776f45913c4c3f5bbd619184c89f1ce1be74 -size 705660 +oid sha256:6eda491a50e93cc213d2a02a9226e62201cb9fecd822a2f5f58446e86992d0df +size 683460 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 85db6bdc9e..3814bffd4e 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc53605d34493e6720668afde848e8ea8fd8d0952fa2922cf920bfacbc92e18d -size 572203 +oid sha256:e80ffec5678d79884a5247cf7b50f6fc81550ee92eedfe72b372a0da832f8e48 +size 564211 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index c8d033122d..b45694777c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1182fcb0a169c181de5ef2e3e6a253fe786d0c849dac8225d1db9fd861685605 -size 730596 +oid sha256:5be38d94e4364faa24df2481633333bb34433f6cad090bb83aadc988d454eeb1 +size 708446 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index b9602b95ed..1c7fec36f2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dcaca66aba111fab3470c638b931a0802bead168187dd013698fac0a1e1d5ce3 -size 596943 +oid sha256:8592915a82bfca1aef0fc3100daef597e6209e76b11c1834662d72cf074dbb9c +size 587471 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index c9fd0840d2..1916904667 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:67ed4828cb3df8fc70a98ae3606913b7d5c601ba505e6022552837539d5f9596 -size 711430 +oid sha256:c14e83c64185869459045a3b4be40d8104a0d9b2bad39e28180038521610644a +size 690216 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index ee078fbddf..0c41e94e22 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:4fac8199c45fdb8dd6323e87cfddd7c7046fc45d92915c7529934812cfa63d21 -size 579305 +oid sha256:01d9648a16963cf450983c86d660b91f4e0fa567ae87bc05e9793d7bf036fe0c +size 569587 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 4c5df1f2ce..6cc363a019 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a84aa6840a69645a5ce646970f2eb2cf3d048d887c59f26222d74124f68ec116 -size 738142 +oid sha256:cc994bcf62c2d2243cd4001f1d4973126316bceed1645ce39e87986a7c57a379 +size 713180 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index abc12be868..a9e3100cf3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:46e9d9af034ed596dd21ed1be2b47777cf4552aa70374efad500196f3605de97 -size 603009 +oid sha256:101fee30503e17d527d07446dca77398f23028a3349bf6a07f165b26f513290e +size 593587 diff 
--git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 2c5c429553..2023e8d918 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea51335dc342e7148b236cf774d42ead6336f140216efcf24423685fa34e35d6 -size 774808 +oid sha256:00254c0a3c64745968f2b8033ad7abb73948deafa185d43ad03b1d97103979eb +size 753594 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 2dd04fb3b2..613f921adc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:672b2e46b3f4cd710f5189549f443c3477f41264029e83432c303873dfdb8611 -size 644116 +oid sha256:20e55d29438b1ac88e85c3912817640d82d0f370e9474844fc1c0b37577e2c52 +size 634150 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 7837f54bb6..082519ba03 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:07ba200530e0d59890af96553e823f53dc6d800bf4f16d79833bc78085486623 -size 800336 +oid sha256:a20aae040cda9170c3875c66580308ac1d66c2d32678544e82cd2de85e90a2b9 +size 777396 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index c818ae7989..94e09d89f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02a7d89024473007b264f560a2ee87b184d8f5b4e54af6d27a088cf68f149cee -size 669398 +oid sha256:01f901cc39e776fd22b110978145be3bd2787bde2b83d78ee1341c2c644dbf71 +size 659136 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 039d82d8e4..5cc27de4b1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:882dd70563e83500fbfbb2d5f1457604e6951e699e35585c0ee95dd9bfe3a9d0 -size 789972 +oid sha256:eae3daf7c87288c2c63e1efcd54ec5ed854add442d32b7a6dde190d21223d6a8 +size 760422 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 33c89d3097..c714012a07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:edd6e9505c528ac40b5df2d087e2e239efec451cc2c9362724f2eb313bd629e1 -size 654642 +oid sha256:dd22e980ab50072c17f50b11f611e19d99f6d316c47e33b3074fbea3e59ca07d +size 643542 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index d4fa6a5913..84b6aecc81 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:405fefa77bec54d5610b8d7c086b0cb92f9dc9192db0003781b2579ba7ba87dc -size 816340 +oid sha256:e54b9aff09f5bf5e633d082892294df09ac22afb03e2e11803106afdeb0f1281 +size 784174 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index ce6101a64a..6d0a7d28f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e80ac21e69bdf2db537dadc66dcf584bbefa545d3b936d9ab0592e2d40a5232d -size 677804 +oid sha256:b7a44b5b9c0cf760e319db408a4ab7b49f191a03bd4fb8e2df5ed2ea9226055e +size 666014 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index d75cd5688e..3f38668c00 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c5b007516a7f4c9a5bd783ce132f20e6ca159c87a0c55d8eb4937d6fd7803a55 -size 795150 +oid sha256:59c8a193e4c1aed81876d781d040d94de4a5f7d72753d53300f479a682f65a43 +size 766684 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index f5dc489d29..abc44ec224 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:64f875f5c86b0b11086aa9d4cf70b17f05e025ca7f32c1f4650e7e088f6f0034 -size 660314 +oid sha256:f8ea91ae48c4283d1cb087de78ac458e514aadd8b4fd5f26721ebf1a0dd41212 +size 648968 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index f9a2728591..a4a229d627 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40dc3b0854c89dc9c373b59a268fd470cf3a65549a3c8ca92fb2f716e7fa55ec -size 821074 +oid sha256:c9b45115d7456e30b0c84f88526f1a075ec23ff6170e812f51c5c3fb9a1848ea +size 791622 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 29dccf1dab..91c7f37ffe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32fd421d3abdabf3d0ce4dff467607597b18ba9a4f7695494003bba33719dbbf -size 685004 +oid sha256:a87b949d8d17bbfc4290e4f5d6f80099ee524a1bda76c6b264a6e5dd0c7e3881 +size 673214 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ac051e014e..e54307ba71 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:15a059289b08bba0460a52130a8e093619f5cf2384c125c6fc34cf6723a2aa8d -size 861142 +oid sha256:eff4a84feaa7b85c98724256c115c7f7b607e9c9a43275d0a212ac5d5b25e6f5 +size 836822 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 0a06c8b6c3..1a13153693 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:305a2d2ceaf012b064cbc7593fed338fb869d72a9c8be6d2a07456ae0823e919 -size 725814 +oid sha256:c634a81559f3955d2177fd582a9758cd6e71bdccfcca006e7a6239d5a7f9038f +size 715700 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index f189155782..11a2df9249 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36fd4474262fa1dd3f552a25b2711f1362e3c0fa47a98c3a74af79861bdead82 -size 887806 +oid sha256:2c385ca98c2ad478c3e974141479f7ba5e34d926fea77f7caad292f7132d26c9 +size 859342 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 755ebdab37..54923faa52 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d499d7433b1e3fe0d45fe1f77de4efbb9a4b5d564a97cc57733cd4fc5d4f180 -size 751738 +oid sha256:3d1a98236d4fe17e6d38e823d02dc9861f5941cca117cc4bd99c20f9e1fc2978 +size 739354 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 4e27de0b81..4b204ae327 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e2fc2435b24b232dd9d66b87687e1201599c5498b36bf9142418a5c992eae98 -size 697768 +oid sha256:29c57d35f51833537b5a6af3014a86003f9d3801a03dd557f67c59a664f1c7b7 +size 688690 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index b3cddf7a10..6f5de6c304 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1f7fa19dcc2bd06dae9a37ae4a12e0a03fb8f91330262fc6c6981eebae7c814a -size 616901 +oid sha256:85f331f3d7abda10e2b02c49e9ec4b99d4ab7e1bb887734b8b6daa0eeb924372 +size 611327 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 10347238ab..693be65de9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:1aec77b927153cd82eda036596eeda959772f22215a9cc62491b155a2d5e6570 -size 724432 +oid sha256:b466c3eb6608116cfdd44c1474113d76fb24d503ab8f722257ddd71e81a4e42a +size 713678 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 38bc794443..6b850a9c5b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ffd190052b3237f3f8e482fb1718de251738e95bf795b81e76f38a2c547f017d -size 639176 +oid sha256:03cfe944356271a2987c6e0a91d4a41014b5bbd70a885ce0c5f805f9e3dda881 +size 633650 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 3809a3ec47..3217b49b7d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aeafdaaa2f9b587308170b37e42b09f9dc0d655d43c95df43fa12dd6ba00adf8 -size 704278 +oid sha256:c0fbe6b9e6dc2d50ff492cca65767141a79cc863f1bdf51c5bcd65ba1a5e2acc +size 695644 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index c87e4cab8c..c6d5e44e1f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6cc38734f89a1516fc99def6ada1852c45cfa4cd283672f45139f99f7218e84 -size 623166 +oid sha256:410985516e32b155a640e36fab1a1a3dfef0a24e5b9dca24fcbe261d0f0f947b +size 617343 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 39b279ebb5..b01d26b000 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0747061775005fa71e4951a8af942be2f98f80df9ca4a872f3c1814f295ed0a8 -size 730942 +oid sha256:a3283a4ae5deea1614b1c34eccecf608948b4248332c923975468b3a85dabedf +size 720138 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 8086559b5b..53fde8f3a3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ff31c501f73ddb1316ecddf30c06f526997f31f9089660a3991e4bacd9829e61 -size 647314 +oid sha256:02e2b86d524f0d0757118e3987116a43d0980bd6d7be41dd9d24994707424d14 +size 639520 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 6da1e42da1..ea3d350768 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d97937b3500f85efc8e77ffc9ba8ab59369e4b41dc4b2b8c12566b38b70d9a4 -size 767754 +oid sha256:72c807d8d3ec6e759bae0efdc895b250024747b25faed2b0797503d23ecc1c91 +size 758826 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 687a90fe43..422ad1660a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef0ccbc5a2b6da1614b0c0b43d002514367e3db237a4be7df8f6ed02d89a1383 -size 686890 +oid sha256:716bb8fa9916d13d9f6a835b587a77dcf516ced054da68968ef656fc053b8d1c +size 680032 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index a09f2d0b17..e062b97520 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:740c13cbda6cdba048edc5111f5b8bacd9761fdfeefb72a9449ad36a30997f02 -size 794172 +oid sha256:499bdd5183a867aa9142f6914fa0258c2a930bf66751698fbe5dbf944b33b1bb +size 783466 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 717690a3de..0ef4818d93 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:357f26f6f9cab805f6bb7b0012e23558769b84a889cef17abd451cc8253c9153 -size 713702 +oid sha256:8ce1cb94954ea645fb1fd7e9fb6e21b6464c70338da109a32e535f88340948a1 +size 707386 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b1362376d4..28166e3eee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1b207361f9984d5688da24ed6146f431d4688ebde55375271d6cf2da88800394 -size 783068 +oid sha256:a9f44c0dd8c721330ac55f60ae58e5cbda9070575bca2b8287f39e76f0ecdc28 +size 770488 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 
741ec3402a..527cda50fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a529669f02aba0c59b3bf2b974eac18850992d7d20c596479e6f20db4974f278 -size 698206 +oid sha256:8313726a1dbef799b259168e62caa2becd866213faefde24f19604c791cb2e64 +size 690264 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index bffc98b4a5..a0cf19df74 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7bc3ff7222fe25c9219ef40e8675c44dd7308bf3ad2fc5b8156bdf63bc850774 -size 809484 +oid sha256:8bae4e9a018b7db294b141c8d5a677411cdc3f8be2d2adc7229d0f90fd422062 +size 794240 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index fd1f80152a..bb60c61fc6 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2115fb095b13287dd8ece255b48a48e69ce8f5edb7d682f9bdc4c793ece2f22 -size 722600 +oid sha256:f7bcc0bfbce9160ae785f3a73c0bbee266099834db197bf47754eb425641df09 +size 713178 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 72c601c553..e24bea552a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17b0124e0c9468fc7ff55182424c402e0a3dba8b0bdc941485b6b44afd76af0b -size 787604 +oid sha256:7472ff90ba6b9398ad2d8cdfe282b21d7c52cbf59340b1e20414a92a2207e6e8 +size 776652 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index aeb3297939..292105c847 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7520300fb7a2193ed3fb220218e1e6714beb5e9e7674affcb885bda15f5830a8 -size 707922 +oid sha256:ff4bd67302ada9bf0b0ba4c6e595b295a8564860e877bd1f94571c8f23d70052 +size 699734 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index c661cd4934..bc80c5b857 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e9d9010da52ce2f69d3be761b5f732691e23b2d3dffdad72f3f723cc15bb492 -size 814910 +oid sha256:b713f966c50ad134cb3bf6a9b32692b7291cc6f9af164aad8c141c5a77ca7fec +size 801146 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 731c9daec8..08d1f6e4f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:1d1e323bc89eccb89bc94ca3a270977bca53e4c0cad459d83e33b927f8136f16 -size 728518 +oid sha256:717764076a477e88184d7dd762a6c2a9b2035ecc575acae4626adb0bcf5b382a +size 719886 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index eb5aa518fb..25a44a5dc9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9474ad20fa7e89c95d8f106d927fd22173df4e2defe0815401f036051aacbea5 -size 855028 +oid sha256:49a00e5b5289e63485b0e8d12063bde902bed61b9600d861dc7d8f78c78e3f0d +size 842300 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index cb86d1dc09..52d053d828 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a53625d219b7dddc9f81e407c0e0a26db3a6b1c632cc8c46892d98a707b01507 
-size 772632 +oid sha256:2f72ff34856b288fc1fa8787b9dec5dc4753df677bcfc0bb4aad3692407aed5d +size 765676 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index c2140f757f..8c4e2aeea7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c5cb0fff0c29c24f1f1fcf5237ee3583d894dd9d19a8c1df170f44677da1bad -size 881692 +oid sha256:3f3249aeadf143d449a8c8be8117ff208d12398f6f2ee1db4e65bf315f536794 +size 865708 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 4def87cfe4..761f932370 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ccdeaf7b7466c0cb8ab74bb1b7201dc5acf4cc9ddb1ceb9cf276278f7db65dde -size 796928 +oid 
sha256:f068d6728c054c5244c6968d11c5fffa48d9ef6b7fd9672dfd75c05ca0bae27c +size 787704 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 87c4ec98d9..89d5a8e3e6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:318a50b94b68fe84c3c9c529047e9d677bcf9c5e01fca2443b6e6b7aa748d01e -size 698360 +oid sha256:b9610ea36863123acef22fffc3b92ea90e7dc98050f432052aca344a2faeb45d +size 688592 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index e3d269165c..66b04172c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d43a794e77c3a4a37fdc7960a17c946d99ca383bea8d498938c137412ac03f2e -size 606147 +oid sha256:1545b4b6ed12a6a9564d89668adb87eac462327d45f09620bcd2c6b638e2da24 +size 599931 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 2c4e940c83..cd25fa885d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:16850eab77ea7eafd6671be1add84c99cc06f2e5318b9e0dd5ec6fc901268c55 -size 715502 +oid sha256:de105ad2e4cc51c1a6f2cb9d94df6e04aa4b01015d05e9047648eaaab5fb3b90 +size 705734 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index d358387442..3091408fc5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:477dbd9b0dc8b7ba2ca246159a04ecfb4efdd3f5191e5deba50ab800694f497c -size 623242 +oid sha256:113d0c5cbf19390dedbb1706c34e40a48be3375ad5909919d29daf14b483bc7e +size 616185 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 376ae0494e..e0abebd845 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd35b4dff7f182ddb13b838df2876e167bc53c1026658bbf1a9552835446fa2f -size 696680 +oid sha256:a2d9476438466ea238bf6e22aac8d3f6d7ee6b866c1580c6e5c0810aa22c010f +size 688540 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index fcc1fc9cad..91c73fcd16 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:716c7beb7d92b7b3044891d7677e07b485fb88589dbfdf5b507eac5ddfe6eab9 -size 608809 +oid sha256:d0e3d235f09c1d236465b10637c12eba7c08d5ad8ad356c1cf9e063d924040be +size 600323 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 56d58371f0..e3205cca1d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e397f79c68de51a1a800d58ca447b5f75833d8bdf168f06120307d84a0529f2 -size 714712 +oid sha256:bdd8d1fda86616fe21b6afe77affaa2fbebec8c16b4dfa395f9f11543ddb1156 +size 705782 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index c74d9b302d..bfac4ad04d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8f226068af3057a7e4c0d6a07fd6c5e02af001e65f6563edccb8a44fac3a0ea -size 625904 +oid sha256:b4371bc43444ece9b5da5efe81bf46854eae0678c338d5495778f2ff4727a19c +size 618208 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 8c8ba0b2d7..14cd6f6792 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d9d5d8791084302cca013af991153bdc5daa5a35005a567f387277ecef1de3b -size 763562 +oid sha256:21bf4d2d906257e083c7b8e3eb462fb67ca29d6dadc363a1b77b9fb2fcc2c10f +size 753596 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 6e900298ff..160b1ba8ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:720a18c19471afd18c46888061b3b6f9aff82480d4d37ca355fcc23c06e46762 -size 675394 +oid sha256:6465241efc4f7273579e76e5219745dd07ca50988bbde49c85e563d027bf5d59 +size 669228 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 05be9c8c28..7f20b26600 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8332fa1673b57671ff621187b381f454746095c0c64c3393a2a0fa8baa52584a -size 783762 +oid sha256:d1c09968fa398a1b9866deb00bbcad4a8537b0ae440ec19e8d395041db719f3d +size 774834 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index c569dc0093..2d8d7aecff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5dfc7b5c6432eff9d0d1c9b4a3e2eded11edea251c53d84897945353766201d7 -size 693228 +oid sha256:d659d906a5716270ce7795dc9bc5558ad8423f06905232cd3543e5b8d7ef004f +size 686322 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index c641f05ec4..018744f451 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:607980cbbdaa19f40b6931dcc6f8563602276a0c78369007afbe54b98813bdf0 -size 786816 +oid sha256:49901e2994fd928c6371d30b257c3bfd1be5d2436b51d830986b7ec8b97ae7fe +size 775224 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 2ad1c887fa..769f10b15a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e2ad859baf66596212c89d645d4ce8113705868857361586ba78a916c36a0039 -size 694852 +oid sha256:9530dfa4ff93ced96ec73e9b316c6d87da24724923f0ba0cf2a9515bef821a43 +size 687204 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 70387807bc..5b0805f962 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:70eb053d2d8d56c1d7d4cc835418565a97719d21d0aa562e87fdceb9ca180578 -size 804156 +oid sha256:578a4a3c85644e2a414a725a413524c8b68cbe47f3806452d87ab452bea0154e +size 791676 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index a5f386c072..937df68e0d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3187a560e4f2b698dc10588cec6fa71389be3f993925b6619c311d0804621bb7 -size 711796 +oid sha256:7511ee6febb446d82b12a098dd9fcf253b02375261a0a1c3d470455709ed820b +size 703410 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 8b8c0cd12e..f04f525b2d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd8838e8a7439d15d3f924f05a2083c317b43e1d9248b1c8ab42d0f5c1e22383 -size 787012 +oid sha256:b9e693f9f7c1291b9be510e244bedfa8dc22abfefc6a78b7722a4c24d8e2f84d +size 
775468 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 2cfbb7913e..3ef1a43732 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b9eceea59dfca43f6b69f335e57a32ed40916ffb5e706802b6afe2bfe8289c22 -size 699684 +oid sha256:b3f8539b2345934f4adb858aa07017aa8f61bc46565abda5652f9a3eefdaf2f2 +size 688238 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 698e88d3dd..b86f6d5c72 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ecd51f3e8e498833a05f627739fd38edcf5b04916f276a374231cb3504af54aa -size 808150 +oid sha256:9102b8500b9265b80aea70a7d7d01525f9e48e35d6a3027cb559851a6add9e7c +size 792610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 98be216578..7c3fb40635 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:186185f3bda1bc51fa03209b9edae3d04179dd52e706e11d7ecd65caf0ef8949 -size 715988 +oid sha256:6491bfbc69d2728411f196bae846f9db5031447e11365955c449a07eb9ab9a7f +size 704592 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 2af8f59ba8..4fdfb60295 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48ea42e82fea232be3bcb1c86e58f13f3766e42aed2d548f4c582b05c771043f -size 856458 +oid sha256:9b93fa585d8a8ade5e7d42a4ff35b713f438cffa8774fe03a83f3c970c7ae07a +size 846542 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 331ac4d4b4..5f7b352f56 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0bea24faac9698358057b773b2ecf337d7ccc6935eab3f903a00a82d86834b06 -size 766022 +oid sha256:41ed43dbe9bdfe0f6c67a8f7e98ef7d2f1b3ea19dbd7ff500fa6e9340058247c +size 758524 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 4bd8e29585..8541ad8db1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:816be327eed158378a31c07ea082b81955ef8e8740b61b281e495e6954ae3b0e -size 873008 +oid sha256:2db4efed331b79eccedf9d29190bddc3eaa9c420de37777606b3511197c2bc9f +size 862254 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index b4bb4057d0..efa2c0072e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:18177b493b23e7e983a305d963dce6d75855596f4b7ba2233c9f74ced7a54217 -size 783708 +oid sha256:23ff28f10fa3d5b37de48d29fa1a1d14a92d27b743596c9a4fd0637e3879f9e9 +size 775568 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 03d79932f6..c16db28d94 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:35cc6459dc65dafef86ac66b3ed4cc0c65b4871210744bb1079044a21fa8a937 -size 650308 +oid sha256:d969df10ccc1f39703290107fe20266a068a97687d247db9d20c0a7c86364e6d +size 640440 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index e87dbef90d..b2bdc339b9 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6ed07dc8e155e4ccb930f10dacd8f496f2dbc53df48c310f1be413bf38fb24cc -size 566679 +oid sha256:b3e9d99d43251fa97af3cac8320010f9c98f16f4d63a0fddb5edc3ac7c397202 +size 561843 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 3d3d07c854..21315ab3a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c4c75d9910526139a994743211dedb16b9e6d01e5c08f4baaecacfa9890235d4 -size 676132 +oid sha256:5b8caa5bf2dbab6a08987f5d259ae4c55764ecf7db33b692991379193e948cee +size 665428 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 2256d0bbbd..18a8ee51e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b24be372365d43b4a8e0744b8bf0b9cca145f8c6d2b3027b131e08c8c6828a02 -size 591419 +oid sha256:69ed8eb5cc338ed8bf172071a8b5b7b947ca5fed53e426f952a869165b3ac30f +size 585103 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 4594a4768a..bb18d94bf0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9be18bf52dfd74845d3addfb638a352341c3ae9fd0d5b35c63e9d405c84b5da7 -size 655238 +oid sha256:325c712751e74289e37c0f3279fbf37cb3a7caf44de5459360600aa4cd43061f +size 646606 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 826b91ff21..1a173104b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:fc5a6206f5d41586263bff0b3342be2f3e650e8c127c257ec21e638fef806730 -size 573781 +oid sha256:7e06567ebaea6f4dfb880a798d60d731dfcf13a357dce99516e43b6c64092154 +size 567219 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 0a8a5e02d4..d6ae964941 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7ec76412344d9a48ee096a2afed2753e73fd40de66749a2e434c9da3deccd0ce -size 682642 +oid sha256:f2f4f32e0509da64e574cddb2e9f65e0186ef7cb159ffa45cd795a72ed177d45 +size 670260 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 16bd57249a..6a7d27032c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b4517e81fe2c0d31dc6d7218ea35f9883d10cb4c5248227ceff7c5f3efe21cdd -size 597485 +oid sha256:c341cd9143971fa723efb88a9c4c8ce95bcea32b0f3334d25e099bda8cbf47fd +size 590429 diff 
--git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 2f72cf7526..680c76a841 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb82e1011b1dbf4d0efbac907e4acec0bd3f5ed0253525e04c6802df8acb708f -size 719456 +oid sha256:23787bc491e96739b7b513399bce206c85eacaea0c3551297861e108d341658a +size 710576 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 389d02d37c..28cf3f04ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb5c2e05b08f81ee085b33a68cd1e35dc9adfe179d782c9c180832fc5b3cef55 -size 638590 +oid sha256:7ba8c35908d2e6aa0538d9d1778e5ab07e06f3fe342699a4c6c9e4f442321a2c +size 631782 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 99e69ca14c..2202254320 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa13d918d3a60879466f5e0f1eaada00b4667f2ec0ab6d20962fcebec7efd20b -size 745922 +oid sha256:07f7aa506de8c7ce765bdfcbf9dd6bf1f6581a9e5915ae65968d456322ffcdcc +size 734378 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 212467d752..4051e89c46 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4528ca2e5d78412a86b3e08f42bd033d108c3d382ef6a3fa1e18e012bed6a236 -size 663872 +oid sha256:64a4b187d12c91a582c8a1d6091a6048ece01f93c8b640fa5c918d231c63d480 +size 657558 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 8e2346d0ed..16873961b4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a6a4ad8ac5d275b6550653ae0196a016bd5e72afc7b3e22f0d79aaf4f32def3 -size 734818 +oid sha256:6bad5fdfe1f7b17fc810147dda7c7c992b1d2c6597ad672c945d855a6712f29a +size 722188 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 884ff01fb3..1e9dbeff8f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:662e71d14f7cf83fcffd18a83ae2ff026945b78f8e7ac8c118a33a7c5a8e2c6e -size 649906 +oid sha256:61ce67485b9cea08c662b09af3aaec3f36f3aab6885ea3c158d9a0974b893a30 +size 642014 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 54e84eaf15..db6ca0e0ca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d209e20c3ee7f9922fc5a89cb98dae4c0514e67886d29d0ba78e44fc710e4407 -size 761974 +oid sha256:1babd3d448b561cca659a970cad5eceaf7b1ecae33d760852c23d623452c626b +size 745990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 0830a649fa..bbfcc2233a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a6315cdcab1643658de3134641d789fa35fad2deb2c375d90d382685d0ca23f -size 673068 +oid sha256:75097c1da8dd8f56d9fc09bd3ee7b912dfe89b3da540add2a929c09eb8c87077 +size 663646 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 76c08927a8..c8b3b246d1 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:363b702974f30c3034370d0bc0198f083190f6358cd1f26136b1f90822d89e31 -size 738564 +oid sha256:0862d8a61092fff3315018975b1f06553b134716082ee6bee85a924df28e5f53 +size 728008 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 92eebdaea2..7c21b724ec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f622b761a2c1c103fbe960a38ce7c91afefc9a9c3cbcc7181f0220cefecd26c9 -size 654788 +oid sha256:92604460a80a241d3fe4e00e047f81e62686991cf2181930461b948d0a623b8e +size 646600 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 385bd5d830..106520c602 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd5ecd06a5d90e59c91f118087f7aaed4d2d056f68b399dad0a77431ec62c65e -size 765870 +oid sha256:162e6f908c79b1bfc7dcef9816cab3ca5d2acfa4f94cf1080d1f36a4d249e432 +size 752106 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 0e7b218b99..ed0e1fa29c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:518b8e8f4626bf7cc573f381c92871daab1f0b2701bb531861bb4d94960347ff -size 680268 +oid sha256:1165841b86be91642bd4b9c7ffab830b47f0770bb760356c7da86a264ec93f50 +size 670846 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 5a14dc6073..8e0779a22c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ddc2537e3209bc762e3d39a68e5708fa38df9f08ec38bd2ec16fecb52a87ecb -size 805988 +oid sha256:9583be9b407c2afe5ff26d4887a62ae91dd1ebe23bf7b0703b8ea5102ff66922 +size 793260 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 2383ede2d4..249d3dfdf0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:43a1c80b36339550c1a0e2a43e8b5adf4eb3d540858bdc16f0d6cb51f1c2be54 -size 721078 +oid sha256:f02a8d94ac5f2117335edb1c98f41a25998487272da9744afbbba343df90b24d +size 713332 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index b8c4215ab6..5bbbb77161 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9280aeee19a0c73b4aee8c8cc749ff014f6d7a59f5684cd06d98dace6fdec93c -size 833442 +oid sha256:d7adba5f1ce98d2030960f59d85a39f08358d013b2ea2ce3366d314992ccaaa5 +size 817408 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index d4f6b177bb..bc1df38e0f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:25d1840fa1471e67174be359a3cb3a34a8f4002d4301e18a6600dc9a4367f8f6 -size 746212 +oid sha256:ba81e2b2f8a9fed47379d7cb1849cbbc419b312085a3916794a3fb3160525862 +size 737826 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 46c74cc3c2..4dc89df144 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bade0944753a0a71a5d081e795fe9e477d01f6b7c0cd4c63a81a36373e47a28b -size 666836 +oid sha256:46d6d3a4d07ef281cc7ed7ec774f13ff4b6db2e84743967f0d72217ebb77dc5d +size 658548 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 61de8d2841..1f498e90f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be14a759072572b7360ff2feba6abcdebd65169f47b56a60334b88847b704f14 -size 586957 +oid sha256:694984a47471c2a552a88d349f1377bcd602f5b2c748e8fce01c23b6554c9b3b +size 581381 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index f324b6179c..69fd35c281 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:c99508127ff24e44fb8c7b25b6be2a93ce9dd7f9ea55b53f8c2e18eb019c0965 -size 694240 +oid sha256:d68303c1f6c40a83c9c5bc0d084b38adcbed830799e9acd54a230574a212a74e +size 682794 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index c4d691eeae..a212d82bf9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d7ad3fa07c1ae6753eded285d5207ae4160a6f988c0b73032dec4dd2dee5a31d -size 608835 +oid sha256:2230d9587bd6e64e70f459204a8a108679ab1e23f3f71c9b2bcb656de364592f +size 604099 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 58a8e85fe3..f2af0358e3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8d321c40233f35e6f8bc4b878d9f53f47b83efd4849022e44f88b12676196be -size 673346 +oid sha256:5a53765619195c1a038784e097c27f51cb007edee61dfbc82a8e46e5ad097640 +size 664762 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 30bfd5c16f..22bffd0c3d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ed0bd84c775c1c8000bc0c6952e5db429728543c5f63545098fe0cdbd8e6783 -size 592479 +oid sha256:5bc368408d83415f742a4534a00a2164fa7e2fe834d75615424ca9aa0f90bb83 +size 586707 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 6b555fda42..e4cdfde3cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd4e7b823ca1a800139d0682eb4b88afc853ec93c22f1cb1c4e76925edb2aac0 -size 699960 +oid sha256:1367db6af2e668920b9900e978760fd1b8a06ca3c233b8e056f22d49ed159c19 +size 689206 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 9b9267e756..9900151cd9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4c8996ebbe6e5791ac836f67a3fd35a4b3a24c8a85a43ea9d0f63413d1a1e8fb -size 616973 +oid sha256:fc07177ea6b66c9d28b8698046beffcd121258c9aa1c201220964c6bbaef0014 +size 609967 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 4c02b3beec..3a1aa00421 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:847d2dbaa3624ed1cf3f8b372d51732381efa0f5ea0b25ac3e1d2880a9611cb7 -size 736774 +oid sha256:dd4ee8e643510d6d87611b970f7ce8cbad8ee564f15ca0021af1370830ba87d5 +size 727942 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index acb40ec39d..64149a31b8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be419661de85e05090c9efd9c0aa927bb6b9ffc3f57332b15b14d2086a2b2989 -size 657290 +oid sha256:5e3f3375dcdf5a591cc79ec9c52d2e1dcd55fa9919d40b4a4a0c5140a89f4a62 +size 650530 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 7ab60eb9ba..8459fd2f75 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b36fd95fd390d1874e6c42bd05a977df7c637712bd301a58b5dee19dbe9a9237 -size 763240 +oid sha256:6227d210bc4e1e37c3764b709aed91e73bc43af3b5acaabfb0922dabeff22729 +size 752534 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 477b333d68..5a8280a25e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c30cc4c1dc50c856505d9acb32853a2442fab8b151d998603e081a5bb5676c15 -size 682078 +oid sha256:611b3a3165ed745324085c06b22fe6166282727a52df5f7420cf0dd2520c2dea +size 675024 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index cdd5500fd5..b01f1dc324 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:86b84c5bd6d0c3512b2cdd92fa4395a03e9174a50d7fbbd90408f71f719e72d4 -size 752136 +oid sha256:d1f1dbb0d9983b032fae15cd7f863443dd558985daa5d54d6b91e279a80ed188 +size 739160 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 
83b71b4009..2f73cfd3b1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48433def785845d20451609ef32648c179ef0655666fd9b93405dc6c6995ea59 -size 668902 +oid sha256:cbbebcbfc7861d6c23600315066eb2dcdaa495b076c7f7e2ff9e3df3f3506848 +size 660712 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 00fe0842f1..fb93d51084 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a214d48d824be216900aaf58e3fda4567745285bf9d3dbf9d249ce738c2a5dd4 -size 779292 +oid sha256:ef75d6d610dc0400bd13a0a139f11a50145e1d714a531e2870cc72bc6eb36c3b +size 763358 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 2a2a34aaa8..0b98ad6e95 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:df92a4e10ab95c1682da9326d32f60df5db3d83d7f54741658c53457c46ed51b -size 692804 +oid sha256:20359c8be0030e4b0ab7e261b56cd8a7b23c2ec8d9c617bf5e1245c8272403b3 +size 683972 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 8822caf3c9..20dce4749b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51528a61292d15cfa2011b200a77585500d9d22d977765160ac0a333b400879f -size 756672 +oid sha256:814fc927d0837d7a3b90664a0793a41aa356fbb0fe3527146504280776b57d37 +size 745326 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index eb27e20eb8..2293d76017 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f1eb3078be9f2465abdfa26b75e58e0aa9d12e36c316156046f7ae6c9600082 -size 678866 +oid sha256:112120cd3b52e8484523e3902a347ba8ab2c4435a2d0e10438380d97c510f81c +size 671514 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 9d8fec1732..dabc7404a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc5b46aa78fca9be64fa54ac7f6f073c0d5ca1ff531f48c74b1fee9197462ebd -size 783978 +oid sha256:e754681d3c2a1dafa35f325cde6e8f1d6975eed5dff42e9ffacd0fef85b99ae0 +size 771002 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 9b65f631d9..092ac5b45b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:418a3830382bbaee15adfb645817d07177da6f24a58d8c1c0c378faa5565b354 -size 698968 +oid sha256:f2858e7246af230ef6f5746b45a3774accec506d5e10bb1d9e89542205e56807 +size 690384 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6bbdb8c545..18e9083b87 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cfe14ccbd15e623d12f5b3d81739cbbe87b0bbd715546b5bdb59bfa75221fe62 -size 824096 +oid sha256:92da286b614edb9b27a23027e8af14a26ef4e2a9fe320fc010286c7b407b0081 +size 811416 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index ece1407faf..492fa17787 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e9bf00612ea9319eeb1e39e92c4b06b7639d40a9fea22789d0dd903afc8f4d3 
-size 739826 +oid sha256:a8145193e438a92b0ee9b757d37c19998a42ba596255f912d5295dc10af7975f +size 732870 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index c3529c70f2..e6a450895c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc3c6836ef2ecd965a164cc60dcec9ec9a68757a4b44cb139e752e9cf29c5a28 -size 851548 +oid sha256:ee0f0c5973a16c614e17ae5d27fe039e11ffef91939838c9692ceaab29f33b2f +size 835564 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 32f2d68611..0dab515261 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:edb0a2aa312a1dc19f2062f3d3643eb0919d227e2f28a2947ea87deec163c915 -size 766588 +oid 
sha256:06454bac1e851d2627a0af385c8471cb68b62854daeabcf92993283676a43c18 +size 758202 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index a0176208fa..0aaa296bb0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b2ed5a3988f1f2c04d78698373de8cfa754a21172464a705f34801ed77b995e2 -size 666392 +oid sha256:7b57ac6620e7529ebcf028992940217609cdd66c7c87bc3145c32b93eccd8796 +size 657414 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 4641bfe5e3..aa2350901f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ebd006954d08b6a70c944c986e1cd37abf26f20f422c888cf2ffa9024189b0d1 -size 576547 +oid sha256:cdeda7a3624abc64b0c9080704b6223a619a2e0561e3fab4c4b31b55d4efcc5c +size 570331 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index a799e6523c..50749c27c1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9222bcdf3f9d80318edcd7f45bf970cb05c2b456ffa1af1c97a6b911ed8aeb98 -size 683584 +oid sha256:89234fcff3a0b733a47350a9e4b8ef07fc9f3267ea7f4a4a58e7aa770c79c870 +size 673766 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 5bd6d89fc4..c5ab36ce02 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4c757fdef3df4079952ecfacf1c9407211db136bfc466cef7115dd3802672b0e -size 593641 +oid sha256:ff8b1e8c61b51d24b932515a5a18293ee31e78bc3c7dae3fbd3e789b31555321 +size 586635 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 7732312c70..93bfc35a65 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b7a55ca4637f6705c36de623475f74718d9d8b84d22625d0a4f17a1010345b1 -size 664712 +oid sha256:c7bbe10c98cfe93fc528ee9a461228d0d783998f5c3319201947d1abf00dae8d +size 656572 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 1dd6e199a0..cd437052cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9cda3458b4d08bfe358ee288c2e93b04e7ae915f05b644e3576316b3b8eee896 -size 579209 +oid sha256:201073be7bd21298f9896e5fbcdf7eb450c3cc07ce69e2fd22a8b7d3aaed65d3 +size 570773 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 931a1bc714..f8c4271fa5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac9bf7db044a57e84c81216c5c8fe73c1cab1f3e354aa97df160c0850766eb27 -size 682744 +oid sha256:5026628e3b61139f91147c0eb2afdc82d24ceed243004a7b822802fc786f9691 +size 673024 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 55c4a0d83d..72bfe1ac95 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c1db2f7d589469f2dd997e8b05ca288f46ba22d5a1924f55b540c6880444a197 -size 596303 +oid sha256:4770cb2157c8112d352874772a57083231559d316a858722da3083709ee33cfd +size 588655 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 80046404d2..fa3b82b931 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e58393443a456c5010589b692762f5836774849f2cc24fb876d664e4c4fdc702 -size 733764 +oid sha256:69d299ac2d8849023649b89ce319036515fdc5b4b3555aaa04e4cf0a21f06a02 +size 724834 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 2524772a25..4c0f783e4b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db3f4e5a0c994f8c200673e57cacb57134cd2323e2ccb08fdfb2a4343bf838ea -size 646584 +oid sha256:1961c039170f7abac6642933f15554ed88e5791cac5777b0aee2f111f8153c03 +size 639678 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index e411388315..18714ed3ba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:011f5bcf4963986b7a32a78bce79eae2d57b68df88ff15159139e7edfc7d36f5 -size 751794 +oid sha256:86d84a121777fa1af22670de82fa6792aec34bf6e6055e2134e7fc99f5cad79b +size 742914 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index e2d7f2466e..95ce6a023f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:417f5846825d4251123a07a5e904fcce0bd18b0c50ef4cefcf1ddb58de233757 -size 664418 +oid sha256:b66150f907940af8325ed1233f479bdc8e4d289ce3ca72c232ef217b4d66d6df +size 657510 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index bc86cc8836..fbe81374a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3e7ca26c424cee8d063fb76eded89b78a7bf66c01799fde2de2eff698cb8c829 -size 754898 +oid sha256:534876561e1c9419e0d08a783a3df114c8e43648fefcec39ba4773a054aff0e5 +size 743256 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 85162d12dc..8c1636be66 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5cc507bf7c11f16fb30751ac69bc40460f424697b5a81cddbf48044528368bce -size 665252 +oid sha256:bb514e46b90539cbd3f98a21fd05618845a4d7fd9b6cffc37252825287195012 +size 658444 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 3f96ec00fa..c653871875 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:17667afbc7714c0dca72bedeef2128ae911e33e573cbd757bd168fd95ebb58f8 -size 772238 +oid sha256:71e994e52056ed047eef82a3ac5cba6769fad756f025615938ddc80b852bf72c +size 759708 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index b26ef9bfd3..b166a8ee5f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:873ec01f6da3f162c754af453e1eda765a83b124b43bfc9043e9ebee13dfca38 -size 682986 +oid sha256:3b49cddf2f7b43c7020154757b0878e7512270f0257115e588849c26e11f11a5 +size 674600 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index dc6fb0f4ef..d8fc3e03bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f73c8a1ff5ee9b40737598661a9ef1ba12772913091ba5c6a3f0c4cad5750c2f -size 755094 +oid sha256:32c1dda3259d52e8da4a83efbd3654d20369380d3cf8fa7336c6ff1ae66be069 +size 
743500 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 0d8a4e5b8d..731feda9e9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:20836e4e42a5a539d6a307c86201a0cb94b3daef0fbf60c82d13a1b477c601d6 -size 670182 +oid sha256:2b51537640b2175613ed9f759c15776adab894f2dca6f3a91c74511eccb77c2e +size 658638 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 7adb9fec2a..d760281fa0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d936dfe497b8c72371dde5b9884f5c5f3c6d1b667bf73aad79f7b37499ca215 -size 776972 +oid sha256:72ece2bbfb360065146f5316a73443502d6b60e52210ee96f78a9eb0d73a8daa +size 760692 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 6531ced5a8..996b6fa5a6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a4221feffcf6cf157c7e64b94bc78e3b861ba82f0afb85e0cea37824c61a042 -size 687276 +oid sha256:4d70447753387f8a668a197ee3b3efddc56826db7bb0cdf7d8fd9ed0e5030737 +size 675830 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index e938654440..592917e97b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb2ef5a1759ea0b429b1ee123ddfe083e995c614bf41b25b7ae9d4293b114a6f -size 826512 +oid sha256:29aea1b335871f6fad54b0654a137675b158f19942bb6032183c1d3f3ef362d0 +size 814574 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 5d0b6a9430..ad6c84f030 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f2b5f860b5bc6a43c6b43a2efe519b54d914e6963b9343eb03827854eec7e7f0 -size 736520 +oid sha256:b3e52be774b5e4573f715c88aded5673f526f57e615a6f68000b2f9c8b5bbc80 +size 728924 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 46cad46439..049f864568 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d4b588085f9466689257eca7ed2645dcb040a878d788151536cabaea99fbf457 -size 841040 +oid sha256:3aacd3fe37cf6952d1ad569a63b019b7c33b2a8ea983ab2bdee45bb5befabac5 +size 830286 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 5ab8bf40b3..71c16e4d9a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd567a1b872033b60bd10dd0b3f3ed7c594e0f1021f6641e76d6254645ee2f3b -size 754206 +oid sha256:50709a71a67990dd4799f8a22f97bd009449350952b4fff01dbdc45ed3db7057 +size 746806 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 58c98acfba..5eed70a60c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5dfeb92df125475659d7aab8b47a2ccfda556106011f249c0f196b19fad6cf16 -size 658448 +oid sha256:bbb9c43f229b01831c6c4b51b3481a451df5f01d077a0dedb8217cf5e5b47143 +size 648580 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 44dbc7a8f4..28bee8ee7e 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf361b0a5b42092637d9c5f89e9d4a8b51ddfddbd1a13e1282d9a8f97457431f -size 575607 +oid sha256:dce1c51212d05462ee355dca2288f38cf238acd02bc50152316e7db3b19171fb +size 569983 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index df4dbfa4fe..10efb0cf40 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c869ae3743fa6072514fa90ed66b735748bd57beff80c94b2daee123b3bab99 -size 684322 +oid sha256:f5e275b52e3f54561455b0635c7be67d0b6edb81d86f5ffe2d1b260ff2e3f76f +size 673568 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 5bec35b31b..39227a27a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:084256e1c5014b914b794183f7d43f44bef0ff83bee313ffb8d9e25fdc1eb49d -size 600347 +oid sha256:ec0885d9b6fe0609ca3d7d51066f26fadfaea42feb1c55476db1db380f113924 +size 593243 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 27de72c975..041ac25df8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32d91e85dc63dd06c3e5333b2e973d6b2592bbe385ba86399b6876586f299b87 -size 663378 +oid sha256:7a5edac38967aa661446ecb93b99e91148bfdc228222dbd7b6bffac9695d0850 +size 655534 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 0b67ac44b1..137d3937ea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:dfd065406176eead9d90192adb54fbd7c16da27d7adaea38a59dfce1d3ff73f0 -size 581969 +oid sha256:65edcae5093cde8778713da760a126e17e8dbd7206cc9be1be59b584c815c8d7 +size 575359 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index c466486292..2948a6bfc4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6eb788b85b63fdf9286ac5ac0626cfcf587c05fe865343bd6d5d7dbca9a19711 -size 690832 +oid sha256:8fb25e4a113e315cb68830005098bdc639168bee2284a858b7b1b361afe9e3f8 +size 679238 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 794cc75e9e..eb979956fe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa261d1ae1802034997d6f603aeff776d989a0928d5b412f5e2d557aa9424527 -size 605625 +oid sha256:d23f1a18b9826d3a384130ef8ca62bd0fc9aa0fbd08d688b40268c6218a970f9 +size 599409 diff 
--git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 14f5c9ea23..9788f0b98d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0497d004ef60f025feadf7ac9adc903c21b9ca04f8a16792fe3bae7ef7ae243f -size 727644 +oid sha256:610ba9972c801d97cd53ae94dc9e943724b9703e316e32ce804f00af5f3d3124 +size 718764 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 2633a28d49..c01a8a15a3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c8d080fa2c7f42d301c2775ffaf15085702249a715931f4f7cd74d46087d2a2 -size 646780 +oid sha256:c53cd61dde170f0753c1c8113cee04dd6b50255d3e28e3fd14d390774fb086af +size 639922 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 0d96f599f2..c8a0ffa704 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b38a45042ff8894652762dd6d8c8c44b768bbff8baba66a33b5cdea59f7b471 -size 754062 +oid sha256:d52d69861fe4df36d443948f440494bcb721f24c210434d219ae473fa446916a +size 742568 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index a4e2feea66..e95d8b4965 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba56720d6879653ac28b3664eab01fda81c6336612a632a9f532287163f9bcb2 -size 672012 +oid sha256:991a816dff073cb2d29ad045a1b761d64d52e62cc6096599e7898ab52d6d9cf0 +size 665698 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 5e45c8fb58..f5699f9f63 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97667a82c967e8394c179157113a05b15b7aa18d5f7ef6e4127156c5e488ce0b -size 743746 +oid sha256:5b293c14a138508312143151ab8a23f0aeaff193dfdf6d51be09ae71a8204985 +size 730378 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 7a42f15263..5a3561bf5b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:461e26d7eef855903d4701cfb53307866f53e98f732a1a1025cfb5714e2a5fc5 -size 659132 +oid sha256:c6950f3e5e7caa4cb30b3a43fe75774fe6ec46c646a98289470836fe5ce6b6af +size 650154 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index aececf8769..18850d72e7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a396ef3bdb691fefee96b30c65e12c4295ff44e9a24fe6c868034d9b7a692ab -size 770164 +oid sha256:956ed7b9d598f6ef12a60217ee13d2280940767179f10918a8c6e1ae251c85d5 +size 754920 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 79dc461041..3bbfeb78a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5412d6e847e46b253221b3ce66740a9d4dce84bd874ed7a83e11e7053cf8cc73 -size 681454 +oid sha256:f6b107ea8a1396c1acfe3d0d3f8d3f6a6dad0233e4fabacd12efa97af0bcd439 +size 672624 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b27fbca030..d7ab4abe18 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00a8d24df08eb567015eba6c3caa59fb42a93a24ddf087fdb644839f2d7755fe -size 746704 +oid sha256:abcde150eb85fabe82998e66611a340f0262d1105b06cd57ae41a25f62cb050c +size 736148 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 9908e60bd0..70aeac4190 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cd2fbddcf1dd06ee48c0e05d74df4bd3a7b251ce2c11e1cb227cae6e0f8f0934 -size 662978 +oid sha256:713dadfbf907aa7f13845b49a5899bc02f0dd2c4e8bb42ac54b648b84eb4f9c1 +size 654740 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index badf5ff3f3..e54a68ad20 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e7ac59dcb11573b114e3d5328049a18f7736bf214f479a2ecc440b324aa8165 -size 774010 +oid sha256:101f11c030ebf1225789f57784a5688b2fbab0beef55715c5fc24ec7d731bb86 +size 761036 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 1320a48a5f..9efdb08160 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e171cca28c6ef054870d135d4022b1807040de0124b49be791d5cfffa353969d -size 688408 +oid sha256:b9b4f71528e85e5c2d8dc66a9ffed9601eb92acde6cfa6ed5bc6a1ad1d35f86d +size 678986 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 3b943d2092..41d1dfe721 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d807a408307ba36263d0c33c2529449f0e5271bd35c72b04ef57a620f2d633bc -size 814128 +oid sha256:4d8da346011011b8b19e1add6d30a8aa517345f5ebd4939c7e0e1a0b4bdedd43 +size 802190 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 3c4105d98a..fad60e7230 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d33773ed1edf1b8af5a8474687abdb7f2ea22b8a3c28bb903e3bf2570385f45d -size 729266 +oid sha256:2db4e84a2a6de65666f82d6de43238d5019aefd96a869a9c6caa0ad6857daf2a +size 722310 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index d03b9111c8..05dac54eeb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:18b69602265759b1544581648af683f3eb8f07e3fc6bc80ab841da8b16e8f5d9 -size 841582 +oid sha256:706c93ecb2d17aadc33f7d7e150945100d8e4036912a370bcc0f5c6c4699a731 +size 825598 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 7a0b7ff0dc..f0f1e8f3a3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:134695e309496541df613f365bb684b80640029f2df542344aa7ba8499c7bf40 -size 755190 +oid sha256:7d4f242a64d52599b1d1ad642dceb35c9868acad999e0033a77dccd91b82fb1c +size 745966 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 0c1e3e64ac..818238ab32 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:81ce564dc0678ab87dfb6f1e31e66028f20f26f48c687512bcbd9e33f71ff56c -size 733238 +oid sha256:1fd94867cb9b7c7701f7929eb3ea9d7ef997c88368a96bbeea1d754a4482fd47 +size 719228 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 8ac865df71..1e05be53bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b8ba324ca615df8806ceef117afa51d393cc7cc8f756eb4b47a4eece217fd78f -size 646946 +oid sha256:f066d6f479e2ca23947ca08b3f1909b75d3c4404ab182d4abbf98f14c222ecea +size 639054 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 69e5d6b60b..ad07c611b8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:a4ba765e952083cdb0c1f9bdb41c5e68f191dd49eef91008f7ba752420484e19 -size 760988 +oid sha256:c6e26d49d7406bc3d4668fefb1f1eb13d3a843a82765268f80ec6a9170a8cb97 +size 745892 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index ce8100ad2a..7f85f76e2e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:07dee9af63bece3b8e11f3ed76a90f68fcef9487c788c592f93f38fc4b8f5dcb -size 673314 +oid sha256:732405fb368bd2c182dc3ec73b72429cc6603492c2e12af5a80a503e333c5d68 +size 665224 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index b63ee1c7a8..8da923158b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:42fcda64d10fd4b734f95f0bd87c4a2d3a32acfbb615479d55487f3c0b556d8d -size 737578 +oid sha256:1da389c7cf82fac88e6c972f457517bbdc4b74602ce2a85dd10e55c5c6aca9e7 +size 724948 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 41a22d05f0..61963744eb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00b61b5ce7cc59ce679e6621252bda9cfb9afdcbb5f16ee1a0766a7c6c513228 -size 652568 +oid sha256:f0bdb5b3aebba3592f35c9a1fd49c27c0f2632af026af518e5ebcd8884892bf2 +size 645218 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index cb75d24979..11dcefc25b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9beafd81313a3620e802ff7f4a384eecaca0eea28f3df63a9232b25cc7eea576 -size 766808 +oid sha256:5da777e5d6ff754bfb1165e65b725d216442c22ae219f328cfa153bf9b5d64d1 +size 752156 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index d8e480592b..272b73e863 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:39e16d373caf9478210cf396b8ab3c19912d20b2653bb35395aaa6ddea9a80e1 -size 678986 +oid sha256:c799c82f808cf4b01a2073ca08789647cc8b630909fac1a831ba73801ac51db4 +size 671586 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 846013b828..07b2bbc099 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0763028ed04f3f6cc61411e42e58683e32b1cc65759af0e60a29e140031e0d35 -size 801992 +oid sha256:a8775b113c14f4fccdb714d8d5dd12b3be681265c9d0358b62c3502c6530d2e5 +size 789214 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 2d093ebd86..3917536207 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bdb24e2df5dd600155452336d3a93f335aef0a660d82b71fce37b21872a3368b -size 716490 +oid sha256:a0df0e524aa8a1f6f47272b25de69324fa388d709fd6ab65434856cafd34e92c +size 709090 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 8173b3d979..c4d43d6326 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c6b1fa091d02753e0525005bae5668069b048791ccf639dd4ebcd4ebd3b5e961 -size 829988 +oid sha256:dcaf1f17abf7b108b2667114113667f8498e0d724c3c41ea1e1c05df4e9931bf +size 815830 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 3443069bba..a057cd6be8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac73cabd7cdc9353dd47b8340c47d9a0b67aca94aae994ba4ad8bfe8b9391a0e -size 746754 +oid sha256:243de91ab376f15335926344e86dc0b3eaba4c3ff08f9487c981cfde9665e410 +size 738862 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 8952268f05..2bd3abbefc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:67e7ca44a29287fcbdfff35b2644ac3da1b7f7b448cf71316adbdc3a2f0882dd -size 833338 +oid sha256:6f4fde90b4bd2f0bc35faed7ffcf11e46de82abf0c42fda74e0a68027a3f12c5 +size 819180 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 
08c7e9e663..94bf382974 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de461b473ee4177cddb0ef346a569761894aebda7574f0275a967fdb4b9e2cf4 -size 740336 +oid sha256:663ebfdb90c2225d0ce43a5f36178b2517b1b01ba1cb2939bb7dbc96b4d2da72 +size 728152 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index cc15f03a59..a5f59efdb6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee98f428bbec987886d1d4f9fe003a7bec936abd7a7b9c9a4e18c0d22af4bdf6 -size 856746 +oid sha256:28ee8482b4717029378803d8b8264b91eeb2b0e123c3eeb606f89bc85853aaa5 +size 840664 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 1df7c28040..68334d2a03 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:162666f28e08723a07795065811a60eb41893e891ba720d119ab9ca69a30b61b -size 766804 +oid sha256:dea0566492f4935b16bfd7d2bd7e65509c21f8885752d717eb8a03873c912b56 +size 756592 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0c7c7a397d..eb23791e73 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c73ae027f6fa57da513a24da51eff5b238f1cf7d56060160cb047876fa6f5d31 -size 836494 +oid sha256:836f41f55a0ee8776527976ef1627a8f427289df527d90e2bc9679199c5a51d4 +size 824012 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 3987cc12ed..49d551450b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a2947708f5e3f9d4b0de726fdd0427abac2be9fca58eec41517add21cb30df7 -size 747094 +oid sha256:567a6646f71be58171ace5339b6aa92cb62fc8f5f2836cd7438d3a1a3425149a +size 735154 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index faeb3196ab..78656c4512 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b3a69c4cffe44f4b631c4c5585e5024a7a9b82871079b5f5006f59bcc599d751 -size 861382 +oid sha256:8d07b56fc4261293a7931caa8d285f88930a4fa22c03df930d6afb60bc79272b +size 847518 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index c55aacd8fe..0d5c778833 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:98cd9cfde30d15470d708f06fbcc527a57c1fb17281e36886c043d0789c07541 -size 772030 +oid sha256:ed005cd56f504a1e7eb42cdfb1e708bb9730aa4b4f6db54a484512fc1d0ecdf1 +size 760734 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index f8fb8e9b5a..d068fe537f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:893e6b2df728a55a7e48074dce893ed487c9159b41b985b6b180a9e66e1470e5 -size 904360 +oid sha256:4910f9faf05af7f2b97bba43bbe1b4b05ea111297f9a88c3c2009783add992a8 +size 890596 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index fceb657042..eefd8ca436 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06e6cae963c40d34412661b1c1c11aa17c4294b4aa03901274a579135a2516ba 
-size 814616 +oid sha256:d24d9721828eb66eac6f927357db7069ce37f2ae7cc2ee9db7549042aee11efc +size 801690 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 21cfe51cb6..0e8388b724 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:932145bc9c9a45e4bbe17ad334c7dd74e688a2f01190035c4e60631c28f9513c -size 928952 +oid sha256:01d232c3d5788c38b45f53f6e41fa606abf0358f4a6fb8ecfb8df769376a12a9 +size 912870 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index bb8c42e856..0511e81b67 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05e1a239d13bfaa9d51ca3a7587dc63e02bf2c7668eb1af7a2c5c7d5b08e3974 -size 840144 +oid 
sha256:b599e84886d2a00e40fd3c3b30b850e0677987a29407d9a4a2103f095f3a9913 +size 829192 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 68c4a9397b..ae074ebef0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2dd5190fc513950e12fe4ed76e37b82390381e200c708f9d9ee893f6c9300f4d -size 695992 +oid sha256:d833d9078e0836d50ade8813e4845e5c17a2340acb980b7f84df1817ab5533be +size 682376 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 0e03c7d5d6..d76df2820f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3bbf529657f781848c146d5b5fce8063b3630251fe55a5d369d38b4b259e3d2c -size 593813 +oid sha256:1bfabb6c1fff135ab9ab83d4b998e83dbf0e217434f175ca03bc19ecd0420481 +size 586809 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 51e356d983..65c6526906 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01523af3cc9e9b099e08330885ddea97001596e3eded7400e49288223c67de51 -size 716736 +oid sha256:419bcb9ae8543375fb5e6f9f476aa6957d70ce74dc0be086f96dd381862cae8f +size 702922 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index ab3d082c1e..efa2957f9a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b8c0de02ab535c7fa2ff799bc32a0f765f441a278cf3fc660487e8d70fc714b -size 614163 +oid sha256:0ac98d77dd321c9b77685531895d624981f8d4cc65feae6817c72a9b29acec33 +size 607749 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 23576a3373..3e4873075d 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:76df2711ddf0aea53c71679dcc65a2d95d34786937facd7be5c840ac68855832 -size 691156 +oid sha256:f4f84503a277c9598696fcebc6a83bd8748fd84c2d4f81e86b540c30f950e544 +size 682276 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index e76fdba711..d1597256e9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:90b77291dd7965d5669ca117be88de8c17f4300c8bae0547ab7d41a58c2843cf -size 610387 +oid sha256:2f843621d2d1c22c9e0c285482c65bcd1d2e674c1f7ab00f7375cff0376f75b4 +size 601951 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 8d7994da4e..5f2309f799 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dec6af08603cab35d21370479e07d3d6d44bd3f93774f4b3f8f12fd1be1c3c1b -size 710320 +oid sha256:7941451e62fc3ebbb4cd0f5ca2235981494e0112c3603d8acd21216ff5db88eb +size 701194 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index f64430156e..42ef610832 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:63fbb908f051f9d01f637428c3ef30ef419e8f93011448e4d10416b5bafa1b0b -size 629406 +oid sha256:b3d8ede638edbb16d8d8d05646675519465ef7f2af94c4657a3e31686478d196 +size 620724 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 3b4b5215e6..0f94effd3c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:43de90d76b6065497c0ea838cbfe51d787c7dbd0c3e8f30329f049aab9b19967 -size 761538 +oid sha256:b30c5838a2d2765b50e906d206695fc49f37157e97ab90ff454dba6011189cf8 +size 752166 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 13df2557c4..1fce82d99d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7934fb7d8c2a88d3068210333f327d11da26ef1a2032d7c1753500d32e3cb63 -size 665430 +oid sha256:bd676913cd7e3aae2092745f1158c6d4b8b586bb964b26e2a49b043e77c927cd +size 657536 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 3d79515f09..8524fa0727 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a35b536747b0a574be21365c3dcc4456bf1921c2ae6c81ba85fffe9add887a02 -size 783122 +oid sha256:ec64d688594bc53c80ee1ae119180dd32c2d29f13b54e1d60b5dd67a919eda0f +size 774242 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 4b61236b81..8b0a77c6b7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:728fe89413d28fd5cb7efcfb6e2b76c96782a83a03a711a62987cf8bd8aab897 -size 684842 +oid sha256:772e1d618f76c17758ccdb7809181c310e0d89096fcd69889846d8780651c816 +size 677688 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 16335cf93c..f40d30159c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b846492e417c782b18cdd9e06b0ae114e8f9be26aced56b72cd1ee55355a7602 -size 802900 +oid sha256:91294c8045f5b0449452ba8d67d90160bf3ffa4065a08eee465e1a1453aa9462 +size 792292 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 09e7f3f0aa..b468c204e3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b881d3cae4bb4c9ed78f7c3545826a00b808c021b475c11f36f55d52f917e2b -size 702300 +oid sha256:e5d3409586c6a37b077771107e4b52f8361a8e080027b0df92afe2e3f786b1c3 +size 695492 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 766198fbc6..992c17fcf6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:13b0f6395aca8259eeb5811bc989499b0424890c380e6e87323c009a6ee3c386 -size 822756 +oid sha256:ea52bb7be7d07353ab38a4f5bb103cbcb95c9ef6bd411655059a82a19fc6fae2 +size 812296 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 4bedd9d92b..34fdcc5c8c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db6319678cf54e7c677b9bac51387f6f8954d699014144e50076997a7b7eeec1 -size 725364 +oid sha256:da35536bc11625f64de2eb1ce69f35a58282c460fd97e13551c2a93519246bf5 +size 718408 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 4ed73436d0..bfe4914004 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5bfaf5f96938c8cb018221e15375d3193b93ddd899111d55a32f5532987884d8 -size 802946 +oid sha256:60e5dbb43672d58b1087109c69ef3911dce3fc9ddcef7c15b77789517b951ff2 +size 
791354 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 280900e891..73d903ebe0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5feedbad8be24b7c61d74ec639ee85ef003b4a60ba45df41b201d506fbc5ad2 -size 720404 +oid sha256:b186d5fffe0bcb47db4cfcd9061ba2d169f3b223de09b0d69a006b6834afe718 +size 709798 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 3a4f711e98..252a2807a6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:86eb0810ea71e5acfa84774092bf916d071eb6ecd5c045e206608cc0b23efd93 -size 825812 +oid sha256:2f627ec0a80152f0c4ca32a126b53dc93b322cf634e66e5a1c18b603f4fe863f +size 812442 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index d5a7372a3e..37c23daa67 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53bfb00de353e0b44ea9806340c13db01a8c809e8405a82157f4c48af7a82acd -size 739422 +oid sha256:0de57212cd579a1bc745c977cbe059193e397a4d8887e6bf4e9fdc2d33f28b3d +size 728518 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index f73664c062..dd8e7cd5ce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cae30918327aff8788b2febbbf4d3bfff250281fd2158905ee24f4e120b44489 -size 874564 +oid sha256:d1cca6b66e90bb6fdd6fb31973141063ea87fa35edd9310485ab4680ecb91e9b +size 862328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 1d548e0648..0544f22237 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa397fb3b92ea7a4bd8e11b1d85b53e2ddfa3729484fe2a79347104838c3fd2e -size 773076 +oid sha256:e112cf183109f201b77e3a3bdb8dcf5071f2a9c9388087c8aedea234b9ae39ce +size 766712 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 3c32a18a31..20dda5f34b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a8462f1a9aafee2946425c9af9a22976404aef341f85094485052062bef1be82 -size 896244 +oid sha256:3b74632ead990ea4abba830d4bfa833c89f9e81a44cfe4980849aa0ecd2c4042 +size 884404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 12abb9959a..15946628e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:42a3cc1b6f2239b24ce9ef09c0b7c7a2ebccc490f694292796cd6c291f8662c3 -size 794314 +oid sha256:d380ca70294682ad7adb2102542b0cc7e7d04a1a6c15cabdf103d198b81835fc +size 786964 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 46299777ff..35105fa9d2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc59ed020d3947faf24f39ea07fd1d6f78d2193adb407b170965ad962b23e340 -size 691452 +oid sha256:dfaf685e88412160a9ff40b9d3303bb3b690c9080a94250dae0cc95c9338a745 +size 677836 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 5145243b11..5d87aab53a 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:732ae98730355174d0a9f7c713da5f9f0ca01a090df79875c363bfeb12ccc2f8 -size 606391 +oid sha256:70b49ea27c63a177ec610d040becea7489c39c6b5420eb604ac9b9f3be874f47 +size 598695 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 383755c60a..65a98619d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a14eee5a9f77460804648094743f5685403d4317073d17eef38f9f419bd86291 -size 732126 +oid sha256:3c8b38ca3a3add932018999e5184bc3661f502fc7672f97c781372de85c93e7c +size 721568 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 32e4521bc6..8b695f17ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a16c57759b0c7ac1685c73c920fe5fe2851721c35517e0bdbffb11f558a5bf0 -size 651310 +oid sha256:9c68850eed3d2110fd6ca292a745a29b597dbee22bf4da008031352e164e89e7 +size 642380 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 0399b6724d..80336ea4f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:46708f22ef7d55a403409308d08fe20072a4ae52ac0f6af4f146e9d01f742468 -size 691498 +oid sha256:cae509ed6b1fa603a9d7af722ab6b94051a289a8f779b83a0916b84bcbb3b0df +size 682718 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index b6976a337f..0398b9ebe4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:28c68ae120ac82e0fab7671d5db74eca42f021de506cb686a72ca3b3e540dbe4 -size 612261 +oid sha256:b7486faf6c8c4e8e1f09c6b787e75f6dfb68a81e8ec14de96ab467981ed0501f +size 603775 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 0efdfe9c43..bb13c715e9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f3f2ac1fe48bb10785d0b353fb22af9ae42a36373f517553cd960aa11944feb -size 738192 +oid sha256:5ffd145dc92a7ff9c75dbec2c2298a3e01367e9038e7d79d2ca11c56056dc8b2 +size 727980 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 19189ad351..93274cc7a4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:164d24c448e898311436d2ed4e0b8adbcfc6db98126b3c9ed6c9d3166a9b8c4f -size 656192 +oid sha256:6f0da81cabc5b4cf3c4dfad9f88bab98bb8bfd2f3cdfbdd77306dc3e3f6ca3ae +size 647706 diff 
--git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 4213e0964a..e4168911c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e5e53fa3268cbf14b1e28072a59ab7e6fabc1f5ef6850e158d7075f09775fb1 -size 759464 +oid sha256:0b779d886bfe8d2d4b662a7ae6b4f61ab4bcd9378a0d2d0ea442afd1b5c60f5f +size 746046 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 031bbf7035..35ab2fe115 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5abcd999f7f45df3448906ef811337425011779845da1b4a6587fe60d07216ad -size 677612 +oid sha256:3764b6478e9ee166a5d57335cab4c1c52ffbf0fa58eb569a68af79f0e3eaeb4f +size 670114 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 5ee12a0b09..2c7217bf55 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bddf7f9f4b8122ae82d42084698acaabc1053015130648c6bcbd1430d166d448 -size 801866 +oid sha256:e2f98d3333c78c11bc7bdb20885e1a3ae90c0b6288296b759b176eb865bd0503 +size 790568 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index cfe4c7f706..c343cef124 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6638ecb395736644524822bfcaafe16745f5e41ee0224da1cd3ab0eec6984380 -size 723516 +oid sha256:4612339892d3ea8efdc743b57f31f3a70e1d6ef703b06adb95480d02b87cd4a2 +size 714736 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b57e8476d5..0bb37f3fd2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41765b6dbe6e651e2e84aa4de093efb9df32e3ee42b641c6f623ab2e99a776c6 -size 777194 +oid sha256:f6362aa29aef7d14d586feb1e22a10932c373ba3128f8d4c90dec3dfc9a43f95 +size 765848 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index bdddd598c1..beaad44967 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:35ab543a25744ec5f08de4006dc159647612eed61ceb8f573541d636c16c868d -size 686414 +oid sha256:4f12168ab730d1dfd779ba2f30748e961058b51d460e88a7e11b24004c86f5cf +size 678422 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 7f3d23650f..d134ee4296 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1eade040dd289cde0a071eba2a3f11ee80caeb9ade5c9a999c2da26c015016f2 -size 816882 +oid sha256:82176366c46af0967c14172ff0cd88f33dfde6bc445e98f678a1111718082a3e +size 805190 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index 048d454ca6..3473b330d4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e4af63ce1c9bb4bceb44a082e0f535fe8063c749aec3e9d3e0d5250c1909adb1 -size 731330 +oid sha256:8425d7d158e62f3028284ff0bab687ce5448189ab9a10f46b0d9faba445be068 +size 721266 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 9668cd2232..847166101c 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dde70c68c51ed00e713f90c70abb173b4e6d4ac0257781217d10d859c081d0c8 -size 775960 +oid sha256:c16dc48b1ed7de73a518e412e71ce942b72a95971357c59f38a5a611f5b60ce2 +size 765550 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index eabe3265bb..e1eafd91e7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bc14203270413a7ae405d56200c07082f4bea8241af9ab4cafc6a61c9377110b -size 693220 +oid sha256:34d658f16f44c6125512b5216ff2c3d9fb7fcb2e7ef5a1846409f81d36bdb1e6 +size 683402 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 5f0802e5ba..e3a1aab468 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:df173da9728b07346fea5bd1066b9b55fa14de71883ee9df06e5f78595edf7c1 -size 822110 +oid sha256:15a58250756a7677497fe0b2f170e8a230c3c47a55b8a8ff20d15dec00c6aca3 +size 810172 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index b4cd8d19de..a669a92483 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:71c11db90f0f671995c246bc6aa8398aca4f3cfb7a484abfc9ba8efaf0bb82ae -size 737496 +oid sha256:425be9c762486cb22377ed791817cc39531fef6399d92cd262b2b8a92cc767e7 +size 727628 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0342cd4cf1..2aaedb8cf3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e4f9e26b093add490eb2664ced021432d85a168913bd30db3222b5d3015f6c4 -size 848414 +oid sha256:daabe1ceb14e839e9dc0e699c74152f79fe33a00037bf85f24ed2a87bc3021e1 +size 835736 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index d955ecd6aa..9f4a0a6961 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2d45b592860caf8590eba54fa0a49815f812993b300d10d2eb02813f1032ca7 -size 759558 +oid sha256:200e784c1b30e63c5053e9b00ac71d0980274538be55878dc17b77c2ca16fd8b +size 751122 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index d276374864..66d52e67a6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f85724c4804c5f32d2655c1b2ce8da478ad1d2950e6af4daa3ff6a548e519bf -size 888350 +oid sha256:978341b9ad61efca8a0d8bfa4c4165d84316144ddb7f1a93f1ff545f58d8c016 +size 874092 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp index ebe87e0450..c6d2b73f85 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9fdf5631add49f52d3a28847dfdb16ec40aa79800969c5c7aa8fe3a0a6c5be39 -size 804622 +oid sha256:e4e05376f9078a2a666826bf38531bf5d109c89a4c51a44d8ef2268295ab07ed +size 794954 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h index fab7286a1f..11007a067b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h @@ -25,7 +25,7 @@ namespace kernels { // clang-format off -#define TLLM_GEN_VERSION "851ffd49" +#define TLLM_GEN_VERSION "b3c16468" #ifndef EXCLUDE_SM_100 extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; @@ -93,1796 +93,75 @@ extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunked extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned 
char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned 
char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin[]; 
-extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; 
-extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; 
-extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; 
-extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; 
-extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; 
-extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; 
-extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; 
-extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; 
-extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +#endif // EXCLUDE_SM_100 + +#ifndef EXCLUDE_SM_103 extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; @@ -2429,1865 +708,6 @@ extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCaus extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; -#endif // EXCLUDE_SM_100 - -#ifndef EXCLUDE_SM_100 -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned 
int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern 
unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern 
unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned 
int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; -extern 
unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; 
-extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned 
int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern 
unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned 
int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; 
-extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; -extern 
unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; -extern 
unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern 
unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; 
-extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned 
int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; 
-extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; 
-extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned 
int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; 
-extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; 
-extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned 
int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; @@ -4834,7 +1254,4311 @@ extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausa extern unsigned int 
FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; -#endif // EXCLUDE_SM_100 +#endif // EXCLUDE_SM_103 + +#ifndef EXCLUDE_SM_100F +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern 
unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; 
+extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned 
char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; 
+extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; 
+extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned 
char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern 
unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; 
+extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; 
+extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern 
unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern 
unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern 
unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; 
+extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern 
unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned 
char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned 
char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern 
unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern 
unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; 
+extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; 
+extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern 
unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern 
unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; 
+extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; 
+extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned 
int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned 
int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned 
int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; 
+extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; 
+extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern 
unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; 
+extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; 
+extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; 
+extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned 
int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern 
unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern 
unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; 
+extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern 
unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern 
unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern 
unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; 
+extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; 
+extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; 
+extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +#endif // EXCLUDE_SM_100F + struct TllmGenFmhaKernelMetaInfo @@ -4872,2409 +5596,2773 @@ struct TllmGenFmhaKernelMetaInfo static const TllmGenFmhaKernelMetaInfo 
sTllmGenFmhaKernelMetaInfos[] = { #ifndef EXCLUDE_SM_100 -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 127296, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "2260c7504648e70eb2a16a24627679d76e9fc5d0a9e736452a4ac8ca4e25f27d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "a446cc59a6bf16f2a5f04d1e89781e12fba36b686de3dd75687f93634af7709d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "53789b1bbf5145756914025e3c191297408e66c7ea44ed7018e761e914f12ec5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "5fa49b67fe5c7b591fe263044c0f9bc09c2fe8a68a2517ed37c7d5dea98b8f1e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "00534a220b0678380d4d09a89a25ad78801bd3d5029d6ac613daa32ff7e6f611"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "185a14940c97342680e89462ca43d5ed5014543000b4c8342fea71520efca058"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 127296, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "df616ce1930bd169e917fabfb47a616888d3c3f1c83cfa2926ffb6c91de52b21"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "86c30527fd0e335a36b67ef9777748de5ed5482ab420d3a93bb66b9462a68379"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169232, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "f4d6af8aa521b1e040ef290ca7040be56e4ee8061a78351a1a97d513347c4203"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "eed022407eea63321a3c9567443bebd10d3e7329ed7ae955ae81014f34d7ca90"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163744, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "7c00785a939f6dadc9478d123d0d136c50aa2917e9f1b48b190734cce4f70a13"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "01cabb44bd8cb6a470a09e47cf49acf8fcd81bb409dbe5b0251d7b9f12bff0ef"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "c9988f2011b107c70ac1f870e9b6163849fbbb910b67ce412241190be889d40b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "7f0016edeafd091fe4b2afe1dcb39780bdc8f3c643a5824f14fcbbb3a6d9eadb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "aa99e4ee714a42121a6ac4dedf725eb75e8dfb68011bbebc8152b77c5912e09d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "4e1186d369db3808fea023d9dc2d2efdbc02fa04a16117230cea9d8fff38e173"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 127296, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "de2292bfba37d40f4dffdf2480cf63390bdd38e0830d9f7b05d559d74f63be5f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 2, 0, 0, 0, false, false, 
false, false, false, false, "5bae89ed1f4cef4e55505191bb6b997bc8034a85ccc00d9145ea24122119d0a9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169232, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "5ce9ef833309d25a2c4ff478a66c03a8cec651aa10534d5184c52d386f49c8aa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "8c09f1167ff758a0e4341f011bf03b97a58bc86c4b9d4695fe38bd8dce47b7ce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163744, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "719bd998e1396e61af2422d5ac5ea76cbc32718da04cad53dbdb7e0323152100"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "be182aa08935cd3306bacb8f9c3ac95a596142c721c138df622f0ecfc4c6c612"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 224656, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "43de0acd2e27f9e00887f3ad33687f205715f9b1b9516fd23df8ade7548f6060"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "0a9d70a4e3867634537ed4e74c4dae93b56594346a1ef0cd702b361324dbd652"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, 
"d8e8e4c41eb5b383fe7b2dfecb5490b552b895b0c410b830e652f63b5bd15035"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "19d88e45ac74a78a52543edcebe562cbb41c3e827d9e21b34c3db60778cbb448"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "c06b1dd5f3541e984aaf485cfc5c4b855b6dc60bd35feca0e1ff12adc6568e3b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "c69b434d1a5110ab406152e8d72223619baee22684cf938b3f38496fbb8fe793"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 224656, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "64e2a8835cda4c237bb18b416a7802184864c9f1c2e196c334093f5b0e9eff0d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "e8bbe33434c86f4400379bbf6a8a339dd02afe1d0c3be92f559c226a737ecf7a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182544, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "71e0f9d2ad8221fdd306649745f21acb958236756c469957921c69a201ccd2ca"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "e788053fbeb9fcc36e29babcb0a774a073eeaa648ab98130db8946451cdf9a78"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 175008, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "4b9c6c389bfe11b6fc418b231b413a63c5f7ecb0bdd7d2e4bc37b37acde6ad2c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "6c4559a81ef8fed329100e119caf1d0313083303aae1a29d9efb95f17248c12d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "a5732846db4c4d074796d033839d8d6cf96974e7779cd9c05c2cbd171bc602f1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, 
"9704ab2572674af41223809210957e1118d2d85111b635e41cca5e6e879481cf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "578c62cf81b1808a5868ba62fb2ddcb6a977a30c26c0cc2f509d1318cac14b0b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "c3970f89101c56963f8046403c6fed28d1570acc01d4de7bb0d6f5af14665e37"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 224656, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "3e995d00cd9784f74b14808516223bbef3e4396d5f15161331e9fa1df73a42ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "c7d4c900c8e27404591e64a93123581a864aa34f0a672c4990faf4467de00ad1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182544, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "2bc234c9545da3281abb4ba2b6c68404696897226fd74e423ab4e566ff1423ac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "539f53b21fb03f826b4af455771063fa24878ee89084d32761c6607e2ebce3b6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 175008, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "d63b7de7c4e174a3e907d71cbf8c6b54aed035a6e4a505df97a71076b8bebfb5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "f9bdbc9815bd67d04fa1c16691bba41f341ba0926317cb73e236e642a25d8662"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 64832, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "538f2e9647aa8849256621cd7c9ec673b3b3477efa559f45394df27f03c25423"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "b1ac15019c02a9a7e79bf529f0d09ae617e3e158f20a4d44385337ad09e2b94d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "92d42e11f1a07ec84e4caf46410c8570d1c69b389fc83a752f52abd02fd58b99"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "4ebbe595d650a19ebddf0bc2581c49160ca48269c7ef4bc47bf9d1a62952b05f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "dfb51e194877b1cbfd4411adaf41e974e9d3fe398c122d7ad109fd6ff852d18b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, 
false, false, "3a82dbe23e28b81246c4dbcc705cb01d0085dc118c28e94c9c2dbf3207229011"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 64832, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "02a3da985ae0b9ad894dbaf99b2b90ff43c1483c1b878524f0e23332ffa3cf1e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "d58be12a89d904180df2c77bcca43070c71aefb5a883f62a5ad3276a8a9923f4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "5ae6087bb71cb800d1714e03c37097a6082a51b90295bd4c8bdd6c2b855806ea"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 1, 2, 0, 
0, true, false, false, false, false, false, "86815baa97f49ea8fb7469b6b244e611fa5eae1ce6549a00dea2cc9b99c3a01a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121248, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "ce5516daede0c1b4eb151f9094964db314a8db918cd891db09d4c432bba32b19"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "97f5b88c23ded5733134a3d081e1cb75a08462d386afafe5d8f9175eb8505079"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "24db69790c85e655e839a89d2f78e9d2276578487b42fcac366e6573952f49a4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "cc906eada5d0e8c9dc0ce697029de660ec336ae4f6beb2d62778f9915d0a1179"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "5796aa0268144af191403a95fed456438221fbeef89c7dc712025afc07a8cc83"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "e78273ca6d9e58650d47989be6756880e41833592180a0c421cf28604806bf43"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 64832, 512, 2, 32, 
2, 0, 1, 0, false, false, false, false, false, false, "1ca4dba30d6bf455a1ca70e3014a73d11d51713f8265cbf0757bf7bfd59f4a80"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "15892fecd42d25155680886a96ec26a4aa038b86ed5ba758406658c1bb8011ff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125216, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "d688801036fae7d8eacbddf1d87f43dfafad1cbae124ebf36126125691e12771"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "416c5e53ce53330d068d9ef3cbe959eabe6720440771ff5a06e0b6ed34a261d8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121248, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "488a7e0f8d73eb627c51295766ac76f7d54d72e3672a49ed4e71f83de2461bd9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "6a3b59b4ed0d67f50ab6a282364a4696ed93c429ceb9463451bffa7fbd5aafe6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "72283a0ae785fce1c83706bad4f38df306b1ca10c119738eeb3389956c4c61bd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "33f66c47ed556bf7e3db131516361515aff1c1aecc804ec99bbc86d26dba0d40"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "59541e9aa4347d12c2f7cba5f3481be952a1f04a9d98f35cd8d75723dc7b68c8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "da4b9dc0920d5f14e8ed36cf9335523c1bba191346fe340ddd2d208cc8250535"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "fedbf6d2fd9dff9ce29f04140ef157b4c737ee63087d72bd40422b8e8fb075a3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 0, 0, 
false, false, false, false, false, false, "3be4fa914f1e509ce24fec71d7aa7424787c6336dee5ff719ff07deb59f67db6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "e64ebce31f8d47d7202fed635a996806e919bdd40601cccb6cc3385566d7011c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "1cc07bd593fa27dc2672f7a3b88cd26fa72841798c12c490f5dde32b46dfd377"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "162b090e96d180d3919e20ead354f22bd8572991a726efc54899fb732a655714"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "f271f539020ad41788b83759b39aadbe20a3ea9cd01f164db9d5ba4ca52bf3a2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "9cf31a516c824d06f6ca68d95fed5242f7cf9ed8e99d9904551f33d9f6fcaf58"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "769f62615d2ff88d0197df4e75b23bb490aa71347e6c861241851a6fe4a5a805"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 
164976, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "c3313c043dd1f4c3d3a2e049ac023c9a048b7f597ddb00674252ddfb42c28877"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "46aef632c88e6769b6da686d33cb69acad48af723d61795f029c65490d4eebb0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "c57e79b6f33c4fed43d32827067835c9dc97d1ce4772127f4f7542cb51042fbd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 148592, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "d1b88af835c11781501892b60b406bc5649da0ea82a4c02d786ac1a6773cd325"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "3e51300cd4dfee9b638e700ca652796acbe967a954a56220b22359de52eaa47b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "690a86eab7946a7694d5a1a53def203f5f57a7a0a3261cbae598f1dacae9836d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "a7a5f03764424cd6374ac52f471de7ce304f8622400936cdc3f00a6b054421ec"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "b00dc41d7129b8f19c55bdc214716c6fbb5ab8bbc6648e15dccb2c9660e4f66e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 148576, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "dabf054b473eb0ad5965c88d00b940574e455927ca3b68b45d0db89a3d78986b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "5a71a8c875af42c6d4bb19606f5ffe085c36a7ded853d4202c346c08d011d88e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "88495f23e32f79dac6c6cbaf0392c5631cb903c41acce92168ed7946e33f8d76"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197904, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "65265ed843abca30ece1cbe9118a5c8fc0e15fd4174b284b4b1c7298c51c5dfe"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "506ca4a998019c8977b3130eb8fc73959d4bdc8d0416e9270a5304592cc49716"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "6271b520279913acba35bf49884d9065dddb24d631e3a2815eb0bd62df13afa8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153872, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, 
"63a5910b84fbc2139c39a60c95e2469f3895da1829df14d50a5afe503d67ff87"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "db6cac5f5329a8a57471fe54a625c4c57c612696e85d3704d290edc19b6252fe"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175376, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "3e884f8fb94f7e6993a599b38a653c24cd3154bf7bd27e8e5297de0e7b68d1b5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "5aa20be590bf09fae4379c76e00a1d58df5afc88a25abee51994c7f8c9bdf4cf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165136, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "44db523709fafbd3388a1f6f217ac9ade032421e1791ed0eab656c49a01ea238"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 148560, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "e6f7b700abb3bfdefc3709441db649001c88be31d137d195c6fed12be069afbd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143120, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "8b2c8423a3c285a01cb47c0c6046430c8fa066bba90491ac88e6cdb6f882b5a2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "4e79b6c5e26b146ae445b5b8aaf49fdc15187b209b2dbd4ee0be43446a36285b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "649706a034792a7c79fc8180290396da45e21816c8cb4d6f4c30dacb4d64f4e8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "c3a4135927d1430539b117b60336dec562b07116d1f0a5b08c711eedf535f43e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "b47f9ef7050f02dfe477aaf62092572bed24ddf855ebdb9e2ade77d3ea70969f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164992, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "1998ce656a413fce9bc2faae5bb6479f7e696da7fb122217354e260be146a68b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "3f07bd9b9e05a379e9e6c5a319804e17cd25b7f4f9083a07b5ef5ac642a7784a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "189384239fc9b6f94083518f3bd5eb557b7f00e533ac156c3142e4d2b4496369"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197904, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "1ae518e3cb884ad25d382d7937be62bdc658964d2f596f4fea85cab61bf69947"}, -{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "d0fbc8abdb980ab5fe32ad61c700bdbfc3b0ae27c2a650d9db82ff5995e28ae6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 197920, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "f1bce2ff55ae86cd317a5cba0259f31e592a45996fa660d6d66495816a27cda1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "f6bc0a63969e12ae102267c8b84b92abcfa4f8fa22a740b204942c80accac265"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 180408, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "466e067a3dcfbcbb46cd582602bd2b80d3cf920f608362533b081d190d81f6cf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 173624, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "ba74e14543dd04695470f1624b0c733ec064595b2b06a119c8207be7f257183d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "4bad7593089c03cd51043ae9bbf81cf1fd467d112b4fe7d5d752d7f3ebbc99d3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "f2c9f76783012006a2a580289fbe3b0b74c52be0995b7c2d486998e2f614b544"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "02d08998474e615d09afcded18ae00ca6815631eafa42b989bb208bb60c44db9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "ad0b4312fec61c3b4b66cc7ddd1fb75e0763a2e54847116948b5d39b63f36153"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "5cda23bb13bb9eea9197a2ca2ca8f0a9bb8f49e4eafb0550e619a72901d466ed"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "2499380de0198268ab63d3d2715b6c5622628183a90cdc7fb655a7cdb8e56f67"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 150880, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "1fdc746b58dd3d5f82fbb309f05dde7381ef92cde51818000d600c54d3c374b5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "fb024db67a5967c7c990a1f3f82a88c2e881a7c9f27185154855c3fb4c3c08d2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 142048, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "20c8bc4fe27bfd1d45e4b86b01ed99999e36b0406666b7457ae7756476e027df"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "acd66ed35f6a0656a0bff227c9ec3f72b10530a810938a73ba599f0b422ec043"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "d7cd24522d90e16a7e8c01e37820bf5c2574bed314a42b32837de7fbec2d3f47"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, 
"ec180f2caa8c9a9b9238c8d9494e45696aa7e5f7a61aff645ba50a43fad31054"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "e7ae44be15e8d1ea69a4e85ee6cd34c6ec184d98d18be6c29fd45ec9291b8ced"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 148592, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "7d43fc02c12af6095ba8b5677c8903222d6d37388c9a465b9989c4229f0bf40c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "f850ef50e66212bf0cbdbae56fd5681fc7b0ae99050c11a1933349b80239b79d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 
128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 180408, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "006610593ac402c1132dda49ab7dc7456f9da243611560c05f499817d43f159c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 173624, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "9d26f7c1d68bfaceb14bfa6b809ba9d8226bf232c1efd074a815800bd0dca373"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "9685018c96400d06382aa17e41172f4b05ff02f7beff8a59eab97313e36bf792"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "84a2a2ff6a5e084071030d8037a5936015a5534c107f58e64c37277ae05e4b92"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "6441a0d53a6da6720446775fa423419b664af68b6f3aec0d71aa2a9d1bf071ac"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 148576, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "d838395c277393ac0139004ba51b923bf4919c981ccea0ed213481849e39d967"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "bed7c1e5ebb7bf24d08cb69d5453419d77530f43b96a78502cd9813ba88a2cac"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "e11a219723d79fb9e1cc9d263282f51842c766ae5be9988444917b9612c6b98f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "11d1ad7f0b8a7558d7e0231a715c92a397958be54ade0a099b7de94823462e15"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "e041e3ec2650a5ccdfe63ae9021d55978663581ee1bde7a709fd5b6f50646316"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197904, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "be13b0e0cc44908f930323fd7c1b11d50f0df4705c03dc3379f833cbdd54bb9d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "e1904203d31396bc3666c73cb78cf2d8dc2d6c4fcc9b141cbc9053d61d91ac5f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 2, 3, 0, 
0, true, true, false, false, false, false, "bb4e1a5222975e322ec00bc98808031b9dc3d52aab03671adf66ba75badc839b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153872, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "6a4037efbd506ea3f173d1f39e6bed1c5d8239dba28b2d3fd885e9b1092d087b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "ad16abdf376ce2e7a15af536b67986a0b048c8f0f938080524c6ad2a68ec22ae"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175376, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "4991f9687e1964f24db37b4f3f0a69cb97233e49b87ccfe561a40539b718d153"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "64cab0db9c7e5ea47d949d6324eb52dbd3c2fa928603f47c65476088efd67d06"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165136, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "ac5d89b9ca791e4d6096bbf353c369833b20f60f8f5103e1b8eb842cca3a5f24"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 148560, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "186c49e53d6c2406c507b7ea027c06a1e5c11e271b09243a16c8ab8f478d74e7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143120, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "51e92da4618d346f36e3fe117ead9ebbe758f39f73bf90cabaa53f27776bd989"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "e2ed2b5c07711791e491e4a31b51815d2cf65b92087ea39b354da047682fd863"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "cb44ab1ab37754a1aafc124cb5a266ecbbeb8f17f18f2b3ccf604cf4042f9597"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, 
"f60d16d0e50cf915c3ca1be63a3b40ed0120f3f587dc1b80ebaf72740bb24f2e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 150880, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "121b17ce33cbcd732210831f8fcb7260633dd0d14f9cd994befb7bf622b52263"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "c4efdd623cec0d878fd9188777e4046b320a447a0c656b4ec4a5b4d6ea9c1aa3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 142048, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "af44651cdd759ec88fb799c7339f9473cebe9c92e888ff4cf6bdfda5dbbecc46"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 
128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "306f3e05317a576bed3cbc7483426aa9cb1488603e14255f6fbcef7913f1f576"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "7fc1745772ec9f72806dcad826f67cbdf946a8dbb69acc807bdbf5c6954de20d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "4df7859e4c99274af78d5d3e36e39cb26d59cdbd6ad787e6703b100f0b895e31"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 
197024, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "f69060adf5000681348cde28ca46df558feb3e2ec818bf6398ee93172df44574"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "a3cdb54945a1f3074cbcd6e7252590d1ff07953e3571d0083937a56846faea1e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "f33df87a78d2c29d2d3a705d6e25ae6c8b5297b9e0e758998342266b6477a4b4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "5da834d304585ebb1c4d066e1c2332743c700ee46badc9c5ced265295b8fcabf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "901dde0f8e5ab3232b5319ecc6ae7edb3d668e177c7434032d0d55e28f64e0e1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "1c983be7cb136e3f6cb80e7d048dd30cbd34204a4d2215b38a933749a742b771"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "3ee162765b6efce0965426f17e8a03b442b2d0def00b67c089670d2be67a1474"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, 
"b8935ad0347b386795cb9e9db0dc67ff922281d3792d7b810d1d32a7285f79cc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "224c813335f58efc68f1cb3042397f8bd04c097f582e0c107100928ee59bc950"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "a659a4855ddd397300e0e270f9b85ddd3e78f2197464ee90c9cb4e37a955339c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 197728, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "9b34e61f65a481e221fa052e435dfe329b7652e4b176f212e029d0006e5e2b3f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "0b63deed1216292f4580f809269ea252a5e14a0d87e42ab213824e253cd34a02"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 216680, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "2e7c2c1d0e0c661db990c35546fff096dc96047aa440443d6ad5258f163f4288"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 164960, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "206cc391dded08e986735314bc8029a8181d277e798ce2ffb9af5ed3879b639e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "f3c0ec8863aefa97df944b2ebc1c85aad07769e963c5f3f8d0e1b2f85047585b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 197712, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "1b9df995a7fea743cd6d0aef8c8da40cb60ef5a4b1fc9300b00b5a8125dc9510"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "a4f5ca2608441720bad60883f3bfc4e3c5f594e3e2f823740a0c51c849c33e18"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 183392, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "4783f0942051a940c916e620ef24ddd9e849a7d61f39d47fd19bd5f91710521a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 
64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 164944, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "3fe7a93a2753d7f1dec4e2cea9754249f94d7af6f7878d9e8c4a5034d3ee8b96"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "9ed4d6124d63b203a31edaf101af6beab333c127051e4f4e33447f9fe5fff4c8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "285bd1b47ae3b9f11d48718e5041c3280e2b4f7a77533ba8be1959d7980484d0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 
197872, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "93afbb7efe1d36c50a2ad15867ff7ae459d84bfe56de3fa73d5f9ef388943f95"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "cd4fab82cd6158b3d92b1e9f75993e66987d145a29e0cd34a5a21c6afd9dd712"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 197696, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "2da2b356a4b730bd3185811371db7fcbff7d1fbbbafc25df6703451ef6fde0f1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "e496a64241d05cfeef8647708086d42f623a96de53a7b0f441afb10772f0d7e2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "43f3be64cf2ee8b56806b1f459f57b3ff5c2c4c4801e8de92555ba6fe4038758"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 191760, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "eaefd03a76e8d8b72f07d5940160dd64c93fd96d4e013be63db645b2695840b0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 183392, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "2001a724fe68802082c5d9d75176c38902a4d1537a18954ee9cc8bc1a6704d17"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165104, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "c0afdc5fc1e1f56ea20c5b5d3a6834a7173ac1841b92b4c8cf3352d1fdb9a664"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 
64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 164928, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "9ecdcc12ca6881822dd3a9efdb469adb4ab7d813b874f7d98f69c68807b60c6d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147216, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "538c4395b0291f203afdf625bfc4de0f077092e757b5b4d29fd9dc2da4891688"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "211d0be2555d93c92720743db5a43664357f2d4550b1b062709b4af6b76a1b5a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 1, 0, 1, 0, false, 
false, false, false, false, true, "6d78288097d5f14e963f9c70f62ddd238a97913dde1d4d8e0caf5af19cb484eb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "a060bee6b09f776d1157141cd936ca917f839763f46be2702efb0825a64c6062"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 183992, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "936ae7b86f275f7b8d71a5e7f8fa3e55e0aa6b16c754076daa97546f03f093e7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 175160, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "8c9861ec6114d54eae8424cf81eb7aa2d5d0f748797036fc1da5e8e1855ebbf3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 150704, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "86aa0c7699eed10c882dacf873ca3f82ac64e27bdd52dc23257eed3d2506a326"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 141872, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "b005506be94a79eb7334cc22a196655216afd7bdabce7002181daf7dbea1669c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "d497638df6a150794abfa464dce1272c6993504950baf615cf7306bc0881d172"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 
384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "2093f6ea4fb071e65ddb99962a237468ddca660237cc45ff8ac65b72c8bcb9da"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "cfa969684bf3655ecd7de03cfac3df32a300f46b14387e07b331e1ae24fd112b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "90a215b9f24f82df061f2a285d8be160987ad4bf5b5cf93297e5b1ddba4fb664"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 154976, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "7ad670c11803915cb3e81410e3ad8af6d43c25dc97fffc0953f2c8d0233a489b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 150704, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "f6e59521beac470857859d47a7277132eba4779c117e44257cb08b8086f6523d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 144096, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "f7ce9fa4e8aac345f2cc631458ab3f9624d0eafafe82aba58f8722eb471984fc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 141872, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "0d7e27f0051f4b387b96d6bbaa76774a9cd5dcdd74dff7c886c5638faac36bfe"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 197728, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "cc1bc1f20915e75af4dd7f768b3e4df5555979e47ac71bc04340e21f7c6113cf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "3ef735ab6a81895d9514a56081462330b99cffa49c3ed1439ced57a3ece0eb0d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 216680, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "290d42cd7b9e340b6f8e68d79a014db0e136d651873a8eef0c03d477ff19622d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 164960, 384, 2, 
32, 2, 3, 0, 3, true, true, false, false, false, false, "66cf1e8fa0852f9069cb7e9d57425dc72ce857a2d61e0dcca481e5e2f4df983a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "a1a5fd764f266bfdb23cca7b2f8b1d0d90fb30f3abadb969165968efd228a232"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 183992, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "aae62f3e14bc22ae6c0944fc6890bcf620d97402cae2992cd38ea11a78b9bb40"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 175160, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, 
"9545a67b9d0a310393e2cb9d84ed38485eba4a52476b67d8863033ac2dff4da8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 197712, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "d11f019bc5bc98145cdb85a29c98c46bb03367ba2e1d83f99c147e65e3df56e2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "9b5e3025b208392c93d31243ba865532bf2600454b25e620675ada2376bcf4cc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 183392, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "ef37e432226b683bf09b2d792e5cce47f105951a99d76f2f6c9f30ed70024954"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 164944, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "f5eb67a47ee0ffc3bf3fedf68641f3f67b586261efe4ee87b0b36b1270378999"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "f70a90927ed5b56c4c0d6305af4077d733558177ef47aa51aa7227d268ba59bc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 150704, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "e451371981d99ec10007b957666845581ef1ce16ab04a4810c5fe949b0d39ce1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 141872, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "218cd63f2797eeda9fd45f8cabe916dbd7b3fd3d52dfb28c24d6d8aa2e445212"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "a1b06f82e3c41f4f90b1f6dda22d9cb846dc228685bfa41537797b1987d06266"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197872, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "bbe8cb31e6433a3c2e8d4c9ee4bdefdd4492dc830103e85e8bf439794232fc98"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "2da1c171dacba2488427c06d4eee49d9b297cd0ae3126f2118bd7d37eef9198a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 197696, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "f694e7765661c95cbcbefd6b0fdadaa79fee161b2711f1d0246648457865ed45"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "9fcc3323193a2877c85c614511a4f0633429c590105ad1ca72313e040426fe62"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "ca38b61fb3e4bab503437e7e65d16c74b73943ee3fef4339abe3772367fe70ab"}, -{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 191760, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "f66f73fdb8313a9a74864f2631e8527a34fb1752d207aebdbf0822c8cd4abd8f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 183392, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "377dd02e2de39a86d14c0bbdebae961a56f984b243aff3f70122fc99abac8ea8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165104, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "aa3eb676868c4c62789877a82a9412431be38aa4014c4586dd806d205b7f1f27"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 164928, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "3bdf916a2acd06d499e8d34f566e2cfa6d6d2a3216af6e4e389f1b3588e2e8d6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147216, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "b7e83e308c03253024bc474e681ea4901689c9658f7e0986f7874c314edd2d51"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "e7b3c0e9ca7abd3f9ad3fce96c60af5262fc46f35f0555c25d85e64d859a0775"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 
197888, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "4bff280c610bb4a0977c571395e5714788383fc0da4f6f705ec81b26e212f9b2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "496c2835e1eb01aebcbdcf6d45b114c3e6a70983fdcbfac33b7074baac5f1f2e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 154976, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "2d6e60da111582b890435d9b44fb94ff4fcb98dc00bdd8f661143b97a9e0d128"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 150704, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "097de5ab6296c665488d78a6e8c0582227086a93945726d07adbc4e7d7f41757"}, -{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 144096, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "d93198f91697f85c7388d96544fb582742e13fe57864a18cb79b1c7c0487af6a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 141872, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "99a9c8b5a3d8d5ab147e9b64ec0b6cbc946c17e52aa98ce5aa4e4add6d8fdd56"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "15243603fcc13db2f4458b9c30d7c98f6425fd5ac8cc982c862f68806fc38969"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "fe43c68ccee09a29a28490931152633b468f7fd747ea85d784af289b28ebef80"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "dfa607ca5490ae6836fbff1783dadf52feaedf2cf16c46b2855dfd01110ae07c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "a47dcd9b859d55bc8862382503a1e608b1b459e324644e964cc274f526182976"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "b81e25532cf35dbcf3495d7612c685f4a9106e1d2dcda032d7fbc2b1fbaed8d4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 
256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "13ec97d95d644d77db7b9360c6e9c0759dbc00df98337df19243ebe3e789165e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "087c037d81dc28ba3455a3cec4b6a8f338e86ca2e701f3352a0aab709e161132"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "b4240d308737b76a4456ae6065f891262a9352ae224d5655688e6d36811e0264"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, 
false, false, false, false, false, "eb48bbf096247c39644d0d8a23843912db6d231c35a888ba35b9475cd7d33c0e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "85c004b8afe737af03a9da7c0f2cab6ca6980e44c12876b67371c03641a443e2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "46a89c70271f25709a9486fa14cdd786d8579a281b98df3942b6cfbe9cfce83f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "cbfc7d37011fe342e8edf8b04ae2ea0f26c3997be3a3116e2ce5bc422f415b99"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "ccb4d32ff1b9b2984f8d539cf504033a993cca7c0dc0bdfd76b08614a1f423a8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "1f8e1508f8d5be78e095ed8cb921f759b48564620b7d13466ecfeac7f4ff7512"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 210104, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "44dc880720019e93b13c7d5bde7a5474e4a423afc42672a5f65132ae1fc54a35"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "4c0eb1464fdeaba9a9ddc1864b7717b7fcc5378c4cb62b7a44b5532832e33ea2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "d6616e9a1a9bc2767696a5ef1babfcc2d277e53b4d781b0424d6c6a9779de113"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "1d4c3821c9cef5bca1b92bad36b3ba4b339ac04685fa1824bb58f9b9e433da77"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "fa1992dd4096c12b09fcbf0b1e6a55a29d2e59a1fc7b9e0071474b3e3d9e25f2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 
64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "c001f23d78a8ac09225b1e5d49ae59c2e17d61099cd129dd69e86af819823b51"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "f7c3695328c1b2e5cfab2bdbe87448fcb899304d214bea56dc54a317a93b5769"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "3b65280e596c808e40f13ab61f5e54f8c5c60f9915174062521e67517c9c446c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 
0, false, false, false, false, false, false, "999b7e14117d35b3863355ac0bcf18975bf5eb87ef1a6e0563daf2cb5d70e8ca"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "c383c023f38e792c3c1a96f5642bec0ab148e2e535fac973b476daf90ed179ce"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "8b82966d5b54a1486cd450386cb59008603504d736137d89c39a86c68df17767"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "7a865f92583239b4ab471cbb909d2e5291fa239e621d7fd06da65d21a30a1606"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164192, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "d00d1e7cb351fc9db76f9b426fe8cd42e816b1053c9cf8884e1b0ba22754cf03"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "8d29d2bcd3ca61cc942a593c677969cbb3ae3c589aad4b4a8caa42aa18396740"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 179552, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "1fd68cdab7619473fccaa41a660c1e2d68f38209b7b059e6a95427c0441438f1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "a100cfa5ae722378566847f0d3bd65b7e93010012f37857f9031220cdbb0f30f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "47fdf6e8ca22ce1c723a2920a5a1ac11c3f7731dc3dfd106c1dedd719ba11419"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "a2839f55024b9505d834d8d43fb6e4286d4f7053594c145d2f55f015e0759acf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156512, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "2c8f04915501cc5b74b3ca59e67cae4d125111fcd294f1013459b7803d46d43b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, 
"a1a8541c7719b7b67bf55549436ec2f42341bd23815ca2a75886506db151577f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "c51033cf412c63debdadfecb8204f79d20d9cf8038eae59e7f824bb03b2cc245"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "ba3581982deb59c85b86a891d9fe2fbc028be397f7053c50cc8114509433f806"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "472bbb35f16af8d558af1a0e1898335b0f40a5631269c7361e468c70a592abfb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165072, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "edb70de2220a9eea6f3225fd481ceee048e81cdb565d9e04dae719d072bf76e2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "b67cc85002dfa5b5d9fa42a895058919ccb7511a4e5c36757b2f7e280f4f0472"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "c12e2d90246553b5d3650904321f81a5a73fb94510d610ca4fc84dd738707f6b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 3, 3, 1, 0, true, false, false, 
false, false, false, "f15b44a45d1d7a8078676334d36bfb621e533e57d5f7d686770f82cbdd65863c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "91785d4b7a8c64a30576302cd011b4ef756f7b57d37d97a47fa8adb7ee47d1e8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181616, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "fc1234fc2944da5dddd4eaaca2e73076b39d621c0df4658d232a72f67aec08f7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "921d420efd6f16cf49dbb2fa62c83e32832a74ed25fe466153a2265c7b5b29e0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 195848, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "06b90adc53adff3f87682d83ddf073fd5e18e98e6320703915a156b57e5f04ad"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 190088, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "9eb138eee33d5f6a4c38e1213e00ebebfb49737048977b226b135a014025082b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "c80fd7348339b0c114a418d388796af0a690540f75e016f41616a7625d9ad5f8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "147400a03b73e613f26e6d7a9cf492b8b4f3db305c239b5bc9fb90c1eb868bc8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "0fa041a06ef85b02eab81202e84b21df79a877fab3a03cd41af7c9fd16459a98"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "e4d955c736f74fbc7c23dfd0be6eabc190ab4809b5475eca59182eeb411b9931"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "0aad7e1e82ce993d410ad51a7aa9d75fbeadde6d85de94fbf5e9aff778e383c2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "626cc32c4bf1f0edfe6e1b9ff3f6019534bdf6f7129522b5d1ed9fb9561bc0fc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163248, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "b3c4e167411b6cef88ba3031f15d23d3f82f49fed117f551ee25d734a9491d02"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "c5efadee6e5626ba33ab0171405b0d455c1f0f84889ee9a6119eaee930eda283"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 
156464, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "04661289df9031920fddef1552d51e05bfa0bb24f578301a4ec9e9cf58baf998"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "fea97bffe36a53cb0e685bbc264d1b7b19a05a37eddba335da044e3802d5f239"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "673ed15dfb65cadc03c62342ef063407224cac6fb5b5768781b3e0b9759172ca"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "496b1615f2b38050829710842f1386659b60619ef7b0733bbbeae2527f7cebd3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 
64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 210104, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "47976282391d2e4b9dc541790b317d6b7b5ca9a1d101e6db979873962851836d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "bfa34b655ff5f9a678f5bab1be509cbec7d5ac9c2696aa352ec1967bbf19a995"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "dd5ed21705d8e9fb7322540a84ddb3e86fe7c8c363f3993ce1b0b1307be644db"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 195848, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "73d07e6bebf378020236c0dec8dc574b9c96969cc7400db59817e12742013ad2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 190088, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "0a8b3df72c44edf8fbf2d28ac49a4c52afa4511728feaa85d9a5648756e9bb5c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "b7298450fbc3d878819e37a664b530605adeae08040caa847e8ee5b21af830a6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "8a4884661f6a2533e2aa249e7b7bf25bd5d763886ea16f116c6eb62df69b27de"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "42106469431bac234e5ac08603aebfd1c53d887a539f0c539022bbd54f1c3ebf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "a4074a11941627816e4bc159e89aa9f3625b69f25ccd9923830be8075eeb22e3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "390aa3568923bceb5da217e56df9b3023dc1542cebeebcbcd532418211e4b6d0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "8079ac282a2184132f1e37e86c7788179cd75db7f3673c8b9c221179f2503f7a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "10e071e2421dc7c4566033fff53cc97944f706cfef90c5728d649066f2de37b5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, 
false, false, false, false, false, false, "de55b204e744e89be8a2ea77a996a2e82f4b97082ee88bf3b60e1424424eaf0a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "7a2bfb30076bcc8d96bece4c9cda55333a452992faca891b36bc5da74dbed67b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "19b4f6c31652d61ed42f6c8682da4f17b107df1ebd077381543d841366c41b3d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "a2ad71c0f7ca98086f2cfff40cf9fbfa67f45918c31fc6f8dbf5c55b378b3409"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164192, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "06ea7fec13c189faf4d52961a300ef06d93cb9ef1d19a7be7bdbf64ba0bbc39f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "c40bef75c346c73bee24690de450d558cae007d4a0f7d82c9e586493d6ed8f1b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 179552, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "d8bd9bbc61a0112520dfc085100f78439e4478a7db5174ea8e18dee33422f3e9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "14415864d3b3dce749cd98caf67c289564d50bd7dc96981ffae52388b6bfe0ea"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "96e47f915175390225215afa271d599f64784acc8c836d80771f4e4c2ab5ee97"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "5d66545f31b72ddb7d92aea61ecda58af8bffbdf8235de8abb0da9f77802c25f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156512, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "973b30717be1e1cc423caaf1e4b405ebd6cc1a1da64ebc4a2cdb8fec21359032"}, -{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "6909196f0aa9da495f9c8cef0d2ece499b08b57059b07b60f2b2f8624a1a918d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "51f894922057a8bb519d914278d2e9954248573c1d1fb02d380993231804cc26"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "9005fde8e356f64ffa0a06b2e532071e4291570c72057620962875c539c9ff52"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163248, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "86cef547df37a5b8e38a7958a9d64bc18560f3373e00297d1d3113e8f088e7ec"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "f2329541a64c0408fca37a97965736689507d45a392ec20a3df285ee257c12f1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156464, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "1a253a3ed19aa635c356872730667b31a7c4099294a0d66b4ede25335fab9903"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "e53d6bbab27a77e7e84159e54f24bd3f6a001051ced237d1f732978852f40cdd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 197056, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "ea4cbe492422f159f9e65cc8f1dc1cfeb4e62bc7b615b3ad89964bf6788d1606"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "64b3c5b60ff7a272dfba27a2b0ef3124cfcdf705a1c92f68c01e342d1be65052"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197072, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "f93e82b08c9f082679e8d3c6a0fab93d3a36243ab7a055ec0e4a7b511e6fd543"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 
128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196896, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "22f002be5dcb09c2a94dbc5cfd1dd6d4627c3910cf5dfcdf8e0c5c0ff2bd887f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 197056, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "43915e19973bd13c0223eb8a39356965ec13b5501a77ee72be32e8109087d033"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "2036e38d222cf6f705050922430bfc6c44e79ac9d47b226481ac6af9c12d4847"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197072, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "27beb83556819a4fe1e338f8274fb5224a0c9ae4e603673741501f1d022f55af"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196896, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "75a5e5099dbbe55b0994f4592793832fa241d1af493366bbd508abbd56c0b0bc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197920, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "b680359d9d7aaecc087224b0887b2490c7e994fa5b7b7788db42008a5fda0884"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "17afa7106ed36489e6f46fa4be6b5a4dea53d0c727758d5132d91631928bff0b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, 
kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197936, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "e3ccbc997aea63bcf6a7cdf22f43e1e10dcc017be0468b8a4e62d10309e20045"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197760, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "2289cb7fd6daa3fe6d13e99b04037921fc74d54808468bf1450b826d7732286c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197920, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "62412887f48aef9c65ece54fb4615b4df155505b2e1980976b21f711ee0a78e9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "080266dfac39ca760d6b6f1a216251c14dd5e10127e7bc45b028674eb11db2b1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197936, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "1c1fb33822ef3d29601e90e95be9e9007e5fc5f48f4662405e3e4f802ebfd513"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197760, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "dd8fd244529335ec17a64bd4f1152bec236af15e3c0848c485a903a5730fcef0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 197056, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, false, "d93231797e341d4aaf0bc0b683821f268cef55d6e4bb3e7de8308816e702366d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 
128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, false, "bcdc396b10c02609e2f82666cdbf5c11a99da9be771558848f65f23edd6808a7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197072, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, true, "1b1a54fbb6bc3028321a107b9ae3767eb78b5bda5ea69da8a85e6a945f671be4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196896, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, true, "6fd61cfb79bf0d34073efaf09327eb1a591d28ec16871fd8a53e9bf9a01f7f8f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 197056, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, false, "79636c04790512c520df59bdf71c4bba10458b02f6489b0625a4d7f7015456aa"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 0, 0, 0, 0, false, false, false, false, false, false, "5d2898527388b081e1b680d3d44948acf758a1fd44873417b57ea3b8c7835a87"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197072, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, true, "7f3372f20353e84cc08265d9068c5a035652c89caa780752a4bb328d6a5e89c2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196896, 512, 0, 0, 0, 0, 0, 0, false, false, false, false, false, true, "9c0f60fecdb92668e17cfe2bb0dede8d5b3a33773659f63bba813cc185dbf51e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 
16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212072, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "7a7a3b31ff0fa14e358e5fbfecce46fd73c9370c69f4a9576c27990c94429089"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 224440, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "5b412b792ff6654186c1930c0b7ee4e5d29b32a23dae1d46c284e716d98e0da2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189032, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "6b84cf8db715aa2401d93daca2b8a5f7611d230838f79c2be08358e1bf0a2e37"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 203448, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "94dcf9cd77b2a9cb9921766ca5120ebadb136a8a3f67e598b329362ce220463c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 194744, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "fb0360b57a7b02de73f4d77c781d7994282892ef742d7930ae9157ab9e64008b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 207112, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "1313ca53f6600cbce7796777efa23666ad552725de904b8a064d0f6e9ae93774"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 180792, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "b3b7a134b8e039f5f3eb8f2ff2bf99ede0a93d5f1deff70d8244035134049a09"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 195208, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "85c947eeca44c1ad63b8f8051946c1b2905463a5d34aac33869767f53ba873d1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, false, "62382ea609995e843cf0d6a95a9a8a777718b6eaa86bd2da897785d9fc952afc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, true, "9797fcdb8e7e1973cdcb9e06a2e7439c87af4f3898dda223ffda026b59b4bc6d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "8270568df7911e3a404601f20ccce6235ad3af9c1b1e288031d6c51cd57cfa2b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "2f884634c22d8dfda1f518d3a6c03aabbef25cefd6e50d62fe54ef5b62b6204a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "6d1a518794a7677aae46e36184b221019db741e3044aa90cd7dd6199fd53f401"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "7607c443bc336658fdfa18478baf3404e626656444169d437b6f3eda9b8403db"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 160944, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "7ece23b2f61b554fa89fc37c429819db1b38a3351642d5d654094b755c9c450f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 173312, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "b825a1ed7def84ca9f1a4cd0ac9073858bede590311dcdb1fc7ddb555cf6040d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 146992, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "e42d7ce3c9dd23f6ed9af11785b3c026b97902b2954043c8e708ab4976ae74b7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 161408, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "fbd026ebd60396249451fbba209650700094ed62db80c15bdfe38b42189f3422"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182544, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "5a81ebcbe72a842d4999329df11ac703c88719ce7571da8bab602b69eabd8dd6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "7908b919c4de295333512382a4ef8ca73c4950509cfdc718091dfb4b38c03c25"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194912, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "c16547ef2c205998ab041ebc4e7fa0aca3c4cf3503dab13a7d46aa088f016e49"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 
2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "5a61272ece58830fed7bb8f05ceac9b0c41a58687c82cd75ad8c647f0664b91c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 206064, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, false, "8baebd16e53bb118ae04fd2a2e409e64b1471de64d360f8c6c6a5eb0f06ec7ba"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, false, "7b4dd99edc0c8d38d08f353d9ff8f020e4ac84f08a9efaf29f95ac6508eb44d9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "76e4311619f5f53c96757f49286dd0134678e41a2320b05ded9babf5715b30f9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "6747141ea83456b06779891c2ddc2b9e5995ab7c606774e5de277e9e46e495dc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171872, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "b709b396f1b8b522ee0989cdcce81c6013887c88d94c392adf06bdd5a7e6f29b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "0ff28e960d09228d7d6cbc1ac5156b0d7dbca8a97e8fef29c9ae4599dd9946bb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 165216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "42b1c3cacff21f8ddd56431adfe45ba655c9b0a65e6ba14826094cf347ee0efc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 160944, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "d19c221675304eb42bd4a4b349706336bfb71e39cf42516a31667d31f6235ce6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen", 177584, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "cd8518faa394224eb08c3ec5760e28b19bf108e0bc9ac80b9ce8c5c811443724"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 173312, 384, 2, 32, 1, 
2, 0, 0, true, false, false, false, false, true, "1bd58103d19c4dc9c171aa6e9eab7d0e44d7aeff4a8dbc21baf108b98529dcd3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 206080, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, true, "ebc2f41181e73ae7c18c0af3a5e286b19786d844ec656d9c37ff49f3298238d8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, true, "b4770c43d1288d5e0b5189ad0f5360c54313cc25354af865d2cadc22611becea"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 149216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "d06bb4b8181156f31e2b17ef019f8843125a1b384c35cd4a13bcb11f64825023"}, -{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 146992, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "cc5292801cf3402de8fc2505bd51fd9f6f65f3f7bc670f5c03998b94ceddcd00"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen", 163632, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "605326e4a2b1c35990b0d17bd98b297e6cd6babc388d9de7e9ce2eccb33e410b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 161408, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "7801d21237ad5a1cae324a59c6ca41af6af84aee7596f6b0dfcbaf1d38c2f317"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 198728, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "584cea39a27b3c3243efb94c16ba22d308f6d44acc044e66c95276bebed52ddc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 185032, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "3f185f7d9d6f35e527b6f682d776bde4ba955b91dcb282076879596e7d0839b4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "2bbb46e3ccf6bed2d3864449042be8a4667b787c5e0fa79cbba514661b82c851"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "b8debf4217dec81cfb3695bdeac49f39cd7862f510f4d1bec293094309b0c5f0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169200, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "edbc0c58f66f887cfea9785935d5adcd0e11dc82b5dba4dc949282462a0a169c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "ef1673126f97feda6fcb8f3c557a2605451b869c50ac795642214625c26dbe49"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153456, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "386137e3a53a678cfbbac263f6b6124a273a07e796495a34f1646c995dc4d05f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "5101b91e2c2571554ee057e176e31d9d42094af4e94895d47e4e464daef9f87d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211560, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "3a2a20c14667b98e6255c219a0ea31acc977270f1c14cc282d3ef2d06e4a517b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223928, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, 
false, "fa85eb52e2d7bbed28d84c46ad599674dd2a09c654a4facf4b27271cd5d505d4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188520, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "8ee8c5a475661b91d6be7267fd5aecac133494ba59b02016a643b0b7a1c8e767"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202936, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "02c302f471218c861499a78191b06e0c8c86174846d5c0ddb27de2f23b6dc2fd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 194232, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "7c7febd08de08a94e7cfb2954af1fc05d6ba8a585943153be99f9c5fa3db1ece"}, -{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 206600, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "bc152a55b2afbb2a39ae49a7d97e7e327468b8e4ea38ee8e682877bb0f30c08e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 180280, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "d6c5af0bf4994f5132c639ddd06f135e761c4157838e7f7e5d54424f6a221a3c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 194696, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "1d7d16470ede90e5aed84c360a245ed29ce00e631774ffb263e2ef8b1c39a9ad"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 
128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223880, 384, 2, 32, 1, 3, 0, 2, true, false, false, true, false, false, "c4c5ab9fb8cbe9234876c37481d298112ea784e649bb9780f4d28c61c25c88ca"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, false, "41e755de7fe1e71ac38dec5bff76e3aaa4d6dfe6cbe0bb3639724da6ed1254d2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen", 223896, 384, 2, 32, 1, 3, 0, 2, true, false, false, true, false, true, "9665fe28ea38105bd0c8da0216f70f60c49efe42d3fbc9641f5cbea6f63d7abb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, true, "75e199e07c181de2e72c90b418343c24772964ee1ce54834738f72d770092b51"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "c42b2c1674c9ef5ebaac53adb031aa0ab0f3a61d7549f7828fe7a5b0ac8a424d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "3a19628c75aa54a364d915cfa09808173e61703ff125d5d215a6d9e335bce360"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "0dccfa2739d490d1eabe9c4219a7ff41da31cff89f69cf9f6c8d7cf7a2e1d4f2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "79a0ab354a360a26eb4c9a67c55ff054f631df2adc8b254379207b715e5c5658"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 160944, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "ef5ee83338c01c2ccb65983930d7a168c029782b08f0cae45fc3bbbe7974daab"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 173312, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "0c54f792432de07b84f652bb5cffc569c16f7d81602ee15a4388ab92d0ac06c7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 146992, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "87ec3fef7316bf15eecd99b510665bec9835123e4b7fc69f31d01d8f948c9d66"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 161408, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "309db09766b122123d1eead7782f8f1c98d7fbceee9cd6671f5b5612a0c2f66f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182544, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "4e462514e0c89ddaa624a0f9a2853dcc491ff8f13457d9b46edfed1be14c97a4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "189b1514e6b71c557ae077efb299ecddbcce16c6985902083489ea660ead7c8d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194912, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "a2b1b0958eb88a3cb8477cf1efa68c2a2c6ac547e201b54dfd600f70e375132c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 
2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "804b3bfbd27948ffb162409da0a7602dd0204ef9936bb994880896cb7e782d74"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 224040, 384, 2, 32, 1, 3, 1, 0, true, false, false, true, false, false, "8dd2bf316325a953d1f62a484d4e8223a7a04fe4a64bfd11bf0aa77bf2a823b7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 206064, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, false, "a6d2b57976e31f3cfd109072487f900ab5e8e4f2610e52ca48dc94c6055c7a2b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen", 223864, 384, 2, 32, 1, 3, 0, 0, true, false, false, true, false, false, "80ef64c5e2ea4dd1f344cb1c084df48c8b6c10d90b10b9e6ec6157f073ea6516"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 
512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, false, "e51c59ff8b38c1e28f8409c60a1a91e45d330cd7f642edf328645d99194ee424"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "0337193790be7a9425ebbcac0bc7ef4057a753969dc8841251203776a11397c1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "19a4dcf9b4d88fc573b4e24b3fa671f48f8db448c51e782bf8809b502385a991"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171872, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "050f3a2b0c0f6209470fb5106d08099e29d799b144a95ced6c385b30a4b304f3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "734eb113fed941f5294f004191d7f4bed651c87bddb74116e55ee42432b8ad29"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 165216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "c9ea3f3f3f682ed33d956e3162d60d91a7bc03592f43237bd067308a99c7cd75"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 160944, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, 
true, "179234022192ab60aad40ca78f25aaf1b3165511215afbe3654d62df27f056a2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen", 177584, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "b070a0f563fffbbd3193320b243cbd764ccc0bf6c5093b591676a1339d24ce8f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 173312, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "caa34b58a7004cbc57c0704b104ea69a8c4c1faa0194444a64d17fd9909c340d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen", 224056, 384, 2, 32, 1, 3, 1, 0, true, false, false, true, false, true, "4891c22d31fb5186ce805efb994c19f75085bf0d76fda7c14ee915866b41b082"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 
64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 206080, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, true, "c2956e2cca2afb1a4d92ab75d02c52d7defbc6d6cc216c6ad013cc7941c57400"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen", 223880, 384, 2, 32, 1, 3, 0, 0, true, false, false, true, false, true, "c1bd927a9fdcca2200963c1c90389efecfe1db050c2e80b7affe43299d143b51"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, true, "5e13e6b2f3a46a983bd270ca0d22120dc655f1ccb51dc919b45b1c9312ea8d67"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 149216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "a2e588ca50c420a36d984ffadd6da0c786d5afac1432953290f9dda869e311a1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 146992, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "9a901e78d49e36820f88832a1167df43f3c3b361d2da37b7f3d1afe96c1019ea"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen", 163632, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "5d4e2de49928ca34f77f60aefac44955fa3072bac250baddcdc5027880539967"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 161408, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "35d2ec3c5eca0453db563af25307cb5dbdc07e3f5aea42fdb4b6af3c0ae61044"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 198216, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "5e4d450a8357b2fddb62c463cdca78964bc2b28d044b41cac2da197fe2c69533"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 184520, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "a5310fa5b923590cc052a4fafe488152924ba64cf48423c0f15e887191d7ae41"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 229256, 512, 2, 1, 1, 3, 0, 2, true, false, false, true, true, false, "da9236e90508408c161cf61611411a19246511b32e25a7cfe65ae920c0edcb73"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "234ce169292ce8ef7e3f785144e5653f86bc129a04b29ede414750df5f9c27bb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "8e50a1eafcd20cc7ac84020812e070465de7225a54849c95c67c47e0b6e605dc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169200, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, 
true, false, "d730b19f453c78c3d28dc177963ab6d3ffab79df02b2f6ed9d04262c252ada47"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "c6a1fd526ed818c5da48e25f8a68395b0d89b6e0d563c96bb6efc213b2aa987c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 229416, 512, 2, 1, 1, 3, 1, 0, true, false, false, true, true, false, "d7c69a6c6355a6efe76012d35d387930b718230d481bc435adf50602dcca2a9a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen", 229240, 512, 2, 1, 1, 3, 0, 0, true, false, false, true, true, false, "28aead4bce95a2f5b95c36848ad5f3399eee9c99252ff4d3605bae0eda58bae8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153456, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "4c8e34e5ec6aa359c7ded813b1451ab9f9036d9fea80bfb8e7fb98304fd73683"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "b12bf763710d2e21707b290734a8ed5d8b600831f098d92b72b00a2495beea59"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211304, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "657649d448bf1ab1f5e615727d81195be97782c6dcc05084b75be369c9104ded"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223672, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "4dd05004d1391b3c8613f75c9115f26e64b0bb247195c855963ea97a33e727c7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188264, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "9619b990bffb91ff43d2dd51df3f999e3cc547fb0526f8083860c70c4bc659e3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202680, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "dcb3566556e8ad24c0ff09e15ccd9553af1435d7a5f6753ad5bcf76f09af9824"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 193976, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, 
"b153c5f869c6f0ec5f02fe2dd9ce64f31f692fe5955f32d628d9ba8f234e5847"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 206344, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "5ff2b043c4b9f643ebbee66b96ace430f6df5a7190c93e4b876412ee9d84b695"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 180024, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "9eb642a470c177296dab10a6f73f0bfb686fc8d785b668c60108afeec94d1679"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 194440, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "e10ebebbfb674534b3852e5ce1a5d2c515b91e42168d158e7e0860db7c7b2d6b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 
576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, false, "46c30ba02aa3d1c7a1dd22c1f6ccfd5be9e99e9a616c498f268b805b5cb766fd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, true, "e03931e12a5a71665de627e931edff2f7de9c4f4c915554f1160513ce548fd81"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "4261bdc11c02cdbfb19cb6908f2c74455d5314ed268517d567d894f1b15673ad"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "56095790bdbf7d619656d4b2c6247bb5082c931fa0c710ea3422f02b69d862ee"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "daf2598a7f6b0316ac9dffbe6b838cc459a00b2f6007a7ac387ec20a6f7e7033"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "377623435c693cab4dae627f7f8c28ea4bfa7b9ce0f05ebe67c0fa6e04628f69"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 
160944, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "75139ec9a43e6d374c5aaad1c54df90267dadc0172f78ff2cd83e75166491fde"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 173312, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "7e59f9802c65aa956d901aa15aada406c1e85b7535b1bc0b6ab88a7c82193602"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 146992, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "9a196f0a0f291c72e1bacb3f84a66eadad4ff01de03f8aa383aa8f37e8f18236"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 161408, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "7f82327167f66f08f1bfd5c0547255ae5c47911a6c3eed31fc69ffaa136b044d"}, -{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182544, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "6afd36a748adb8d55a7b8e118918ff8d2b8b23958eec159d3845539b0335fe4a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "28efc69351348dc1ddb7464ae63865f06a399cdfb3a2e2c822fa94ff469d1519"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194912, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "033c0bc92970eb37257878be314832d600e00595482bff3530c018aef21fe9e5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "552a600d83cf74d09f74189dd0bdc83ad326de6c96775767b1585d95897e8842"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 206064, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, false, "6d6274551e59236338bedba310b64b96380011a476744d22caecadc82ac4eb35"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, false, "d906be31c72afd3649af365ccb1598761709241a95539ae7779cf3198cb32692"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "a74b1e416ee08bee596efca8e43f86959c311a1f702f767a843fd4b6dba0ef88"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "54787f41224fc993178a07d8d30576d48197f01b60987ff53ff5ff0a0ede6aa4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171872, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "38fa4017c014ae5c6d8abf70e597bea5197670e3ada102077799bd6a4623ef4f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "a00b832a64e95b4101c7df89dc481f1553b7ee943b4515e19513aea7e06d0c6c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 
165216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "60ab9e88b53c3fb6ae612dd5e8ac59f475c31802f9c3148311fd49f479710cf0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 160944, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "9b6b8933d4533151c8f4c27ab72c82eaf64dbedbf5a8a896017e7ea877817dd9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen", 177584, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "1b150f001bdef69daf77f9c1e7e929f3b6ccdb02c24a890109c4f088ce9a258f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 173312, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "14efa252d5e78d94a495e7f340627f41d58b396763d0f5777ecc4d07128397eb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 206080, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, true, "e1268df62f894e8a76ce4bbb73210cbcb0154e1a24cb41b89865ac8ddd4b5784"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, true, "4b2b81d443c8acf0096f829dfb599b176d4e691c9a10e70481efd59a1a2ac2ec"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 149216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "85e1fe2c09f330033773b9cc03020db3518a51a40c8aa507ccaacf4b1e145ace"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 146992, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "b244d385d94e013d489348bcb103d50730f61d4eda9f96bcbfe1d0c754c79582"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen", 163632, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "e783e8021b6946e46755b26fbc3957cbf3446b02672f42923bafc7cbf89767be"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 161408, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "04b888bd03f6c0143badf75d526b41f61757e26f42bedffac76401add21b72ee"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 197960, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "bc99400a58844616123a4111bb51d8923bd6e94ab5ac8d05a8d9a86524d5ed03"}, 
-{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 184264, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "26f1b34cddcf6052271197f96edc034743b4c33d241006746ed6c3d1d1d82380"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "b76587ac9e3f6cdb841ebef596a2da8f1e5cfcf0eaa4b6a481bdf1121a84292c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "6bd87f30ae281ae871ae80bc3c32d89dc183f6a1d84e37fb604e3a5b49094304"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169200, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "0c487b29c84d336ec83dc5678fab9010aa7ce84d7e955282d606071db5581e6a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "8da0a5187065408081f89583aa5411bb9c7e74c7166c62488d0ca0043bde049e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153456, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "386d09e6d6b90d8470dcbd27a9674cb0c0fc03fc4bbbe794d54d57fa0163a4a6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "310848bbde309c7361c304b3c2b63ba56fc1abc884db189dece7ee2791f0866f"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "8286823bda167201b520e3ebc972fba664189f4fb2b24331cbcc63d0527dc825"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "12784806d4897097f720805cfea0301a79f1c545e5eb6c0d7386d71c799cc8ed"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "4c58379052ad2e52330ffe3461cfcc03b75fdfd67e5eb46bd73b87c82ae78efa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, 
false, false, false, true, "3e91a462ad16288a7424933032d6abba9043448fc02f926a2036c7ad104295d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "ad8bab4fc9e40a9fa9380bebbbf6d469cef4a9c6a460b28d4334c3e911fbcfa7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "49cdd07ab1ed67067e34d143ba69b8e13ba0645bf736c76c27c9777118c0cb4a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "b410ad81ea741ea4f5549fb63704d3f0c8fe6e1d07e427b8f770e25c2c592bae"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "0173a77d8238c299b9a72eb3b8ea815997544a605b473a3ea5e07dd52116c694"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "6eb98b372ad587a74a8881b076bed55ea80c62f33007af8ae1f85bb6f9477408"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "c4160df0411daf40eb331e295574800d7673b9b905358ac7da50b4315608b968"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "cc2a5375a6140c1c606dbd49ce32513d8ecfd0ce4cf250137b6a8f4226b05b5d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 
128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "9b22457abe08709faaf2cf62869b91f7152247f1956b48326523d76c48fcd7d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "a9a5f9d18c2e8ee080daddc334d59bbd33afcdd8465833f65aaf5d8f0dda3d29"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "aeec086cb15ad95eb3d991a44c7857f150f4ec24b1d480a68ca75bebf1f867ac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 200888, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "f7afcfb7c7018d780ffbbcab58edd5720fbb431e763381bc1a6b44a7e11fb70c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "32d1a1baadfadee7ff40233ad09a601d34cb7f954420b275a150445cc1cbdf80"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "2c237b680748743be0f1ee050ae69c6af9045339b864f9d1d9762ba7dc8352ee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "44a8ec11cd2e401507f721e590b152f2638891d441dd0d203f5de93a6ed34214"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "44fe5192eaac0b327915c33c25b8aea250234bbf123fb134a255aa749362ee45"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "a8838bfdb2bb5c4ef675dd998f65ab98ef9bc5da6b5cc24a3c63b40b783df7b5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "e726397c34ac81fe7d3ef3b639869f1a5fd6366f6fef37b5908a8be32c200fe0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 1, 2, 0, 
1, true, true, false, false, false, false, "7327019721c8efda13662257de71e9f46acd60656782db76ca39dc1973660f35"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "b98440334f08110ce9e8c7fa08d7ad8edbf295fd118bac9db18225f8377fb8f0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "a592c5d1e202cf78f1f58eacb6e52c2c549927101119b1e8f7937e1965dc4d5d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "93199886007c7e7b228a0928ddbadfb75a6ce87e12240b9e78d115ac0f24696e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "43b45f995cbf0a9151a2fbda1f1f70a3c4375533b37ac0b9e85bb1393539baa3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162144, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "ea6af9febcbfc6213696ab612d9158a494c1aea6212d2b4f571f67616e0c75c4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "f49153033e0bc19f9acf1a0f6d2c6e6700635e3226f70e9fba0ebe282b6e7d6e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175456, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "e1ced316c294d354c0ed40e0f4c646b90eb2194ec5be40d467985dbcacf96c58"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "4a9e7fe1d0f28af4d094130474b0aec9a089237ac1225f28a1938b71405de037"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "1ac4dc718915fa07afe1623484bac5d8cf6ed5318aa73c6f6471735b5f978887"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "2174edd4f711a0fa66fd2064e4e2f20cc2259dce92c4983b4294825aa580dd67"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155488, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "789d262e23c1b40f27fe6066a911c92695f136cbb4c4363f7d4fdceff4a5ebae"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "e3b2e8d11bb62d069b2849950955b590f0f22b2e6b5f6689901bc758369c8359"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "391c75558653a628773f895b6376fad3d6aefbb0ab0ce46bf160932734432325"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "36fd82322761d686f48b2ca908c668f5d58b930107550e6d395ea3826b7d0713"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, 
false, false, "9d7b9a5042592f5f3fcdeb4ad60beace41121685e757546d502a7806b40e4c02"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165072, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "e6f78e603f92b26c8013b0cce8204cee7c72b3fc3390ef95bf3dcdb4f4b4832e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "7f67979403e3ed72f1c18eef4aa07a409dee97f98584e92b81a942093ad61f89"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "0c11cde823c38b0cff2488fdd1f7329afa99bcc7d02ddfc9788a063df9324c50"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "049f51463da3cae2d1f006ae0ec1180bc496e7ff0fb37b884ccdec07af5d02dd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "97cc54891041d4a071694b9cd212f5382ecb69cf34a67daec5f63ed04e944337"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181616, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "139bbf203ef71ca3df2dfbd96bcacce8d6ff20f955cd507e2146ba0c0408e0a8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 0, true, false, 
false, false, false, true, "3ee9483f4fff391e61a7ab96c73a461c91cf32ea65fc22245f930c91d9a01544"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "bc22c56256999e97dfdd6f34470406a013b2ad7a9e4b413e7f9b3bb800212091"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "39631729e761d5d4c79258c73014a89a1143a37b576cb71ef6bedcd613eb5c39"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "33ba7241eb630b96c9e2bc1423068b7ce8c94a48863190cb2bff86e739753cc2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "a6863bd263952e26cb9dcb9e81720316bf2e34fa6f764f4aa07291ff27058f85"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "fbb8c56365be45cdf42e41169d00ff24c6770b8ed1570bd65306d6fbe7e370fc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "1352d7c54ff51a6779a208a1d3163967c9ac62bf31d1f1491d38b3341de91f8d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, 
"d228c07b5470f1ec265a6e8a7d30909bee8837d5db3ff3c845af2a5af2504db1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "947decc41cc8dd533fecfa28a84d3faa730e2fc555802729cb96ef74acc8fe8a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163264, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "9fc3b0db752d39cf57eac04eee839d68fb6f301401a65919bec71d54e8bba05d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "efe087188696d4eaef8f864f793b2b6a6593540a66703d0d410032f254e478b5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156480, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "5bb128092cdaf02f861f2085738341f02ae82da06efa1a6e2ddafc9e8bf54742"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "a889f959610347d1c062139aa0968e671e41f400d53c935464046816fe78f45a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "91c20e71a5263486464a69026673ac2950b49d4710666506004178b5b500a13e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 
2, 2, 0, 3, true, true, false, false, false, false, "5c4133ce6e26e7299dd88f9b09b5a7b543d88faacdb4ddb9ae657511a3631c96"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 200888, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "2de271dc40d6cb72bf6a1077e4f1716f93abeb3123ca64a7de70c2de7f71b990"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "d2b5a15795f8ba559945c47fa0a63150b7f2780df740f780f295a4f863686ddd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "962d095dfff5e327fb0468a04b94e5d67d64aff5d60bf393508e3a14569e09d0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 
16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "6e2d898a9b2301d13b216fa8e3c8c9598d6656ed3d95ae93174f63f587ea7611"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "935a4f10aee9f2ce920c74b88ae203babb0d58c147d78e40e384f95e4d41b877"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "f636f951f534275ed148d9867204e8b0ea8658520bd54e42b372d323df86929d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "957ac184f5648c248d6542ed17fc3ebebc45a269c22727833982d53e62e0dcc8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "fd79803c97ea8b1e8d15ef483061194e15d7af73694422ec6cfbb46192d50a75"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "041bc4f4a29f1c75b7248a9ed259808e7087f3b9416c688161eebcc4a932b660"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "9be8bbf3d9bbd3ccaec44f85e97fc0b4a264dfd1202f80827789af5041af2e47"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "fafc69df6dc85db7711d8fdfdb8440fd36c64ae8f2f04d41478e533185685996"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "f611097c466ff4fa823733c55c522818d319e8831edc7843802fec1fb97f2aa9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "d502365372e9f06d670cbd4ac37fe2619a2a00c183ecfed01037d48b42310d20"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "f160d05c74c94507fb9e33d239da79f49a64250e946e4cc70b5b41a62eba7e40"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "153b97b94f55f64d736a05a367657222afa5448818d4a7fee89434a7894b4fe0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "673a4d35d0ece8297cf9a406d92b8597a592f2fa410a0500535680668f955b73"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162144, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "dc1db63c0f6cde3c299fb0e6eb64ec9759e768e4ecc2c6c7463198b67f3e7b34"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "32789ec3033df646f1773869784ad14795347fbe65315ca0403c7e20afee0dbf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175456, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "63a7516d951c1e8a8b01f978dee4bd11901aca7bb14252738d1fb32dad7dba7c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "72831f8fbf72a3ca3d7f37d88def35a3f2cdaf5ef9144a60076715f0d3d36502"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "74d7cb807886c0acd2231c8bdbd361b6f160d2a39eb200a6a539ec4f0eb9275b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "479e6a19647d377cee0c3127788752881cef8b73a434cfe2a0ef7cbdf1082b9e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155488, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, 
false, "0b8ca02e970c927928e216261b2e482822ae322dea00d46293d986078ca11d2e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "f79af3d169540863b15308a8f0cd7e38116738ef9a72beb445b2111ed59e4740"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "4f8fcf017fed4f17559fe65de62b00d2ce570fb4f0b4b06df104361210ab8186"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "5ada333df1b541e0adb701a6aeb8efc020a7060e73ff9ad8b1c4b189b2d5df51"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163264, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "727f4a67b0b1d3eeb925d8d225bd4c7d12c34a4835d28b6a7818c0fb618eec66"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "94d32fdf0c71ecf1fe5d6139145a297534f8941da7092e4caf948ffeddec50c4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156480, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "3f6a5d00d0511eda5ab7d1e55f2ceed8bb74eac6d4e5891bbddaf3ae3e53158a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "6a5e473cefed1e953cdcc6415ca9923af2fc222d0cf784e2ab698c7b69301865"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "f9108a7a3fff242b51d487a712045a1b9fb0876496c8be8d5b0c0953be0164d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "4befd4ffcff7ca8de1d41310e923d11ee0935fb740b6342c2d38ca378206b7e1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "ea20c7bb55a5a9e12fb45451c2f8ff01c9fdc5588ffc46881833a841db647732"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 
128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "d6c472e17c7548945d7012337ca7be3cafbdd0a8e7abeed31227961467be87d4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "d89892244ff4d21fed02fe45c950c0943ae4355c01f377ada5b8cd4087c89a9b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "65f4e9f450919c17b47969b979ddcafdb1c6bcface0a224d9185aca859009f11"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, 
"3518a5a217a65d8c26f78c23bb502e516c995119f65b1995edbd42893a264546"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "f727b26afe130dc710fe3598f03e3fa69e8d1203bacbdde868c4d39ca8f1cdc2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "a543213e05b62fc498551488a19cdf3ed7f79eda77ab9f2d6490688e6ff2f0fa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "a14fd4b763c26d11db3da11b6b370d0a055e246a238cdc709a12861e574916ab"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "452a7b3e3ef421a4f5c422fed44f4576eec002f576dfdc7db13bb82ef5fe5fed"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "3090bec3527cf6c439bfd0ecf1beb56a596149e5bf9b4780265bb06f121a8177"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 214208, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "4f30d2ed7143058a6db277bdc32278fda4a3099554dd78d3bc97455469b0b2eb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 1, 2, 0, 3, true, true, false, 
false, false, false, "7f4912625d1cb2c81602dbb3f191c94d4e6332bf3634d3e65bdf70bb9b713170"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 208568, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "155ca5b4868a2740817280eea0c92f05b3f3fd43d1dda4f12efdbc8f833cf68a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 181440, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "ef550cf2ef0eb3b4548151178f71401dab7613bb80164a549ff79362f700399e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "c13354b8b9efdee5f013f063be7f7141b5466de0d1db20714860a9c1ce7d9721"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "d4a6410c3a1163a9875c60ec98175b22ca09e736c8396e139639c2216581e6b1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "667c57060c29505dd7a37c64e5404990c21dbae2a0216080cd10e0a2504bc936"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "7d56e04d886785e08c645ca25564d45d5f52de3b400c4b3f16474430aecf5733"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 181424, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, 
"166dee35d63a96eac46ad53d577a2393d74f5b7263794eeea136b989b015d2dc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "21fcaed9427834384447e82f5440113860a93a97c1da008da17d93de3a2dcf4c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "59a392ca26bd31a3d0e312fa65c24eb6ee205bf186b01147436bf53a6fc92ddd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 214352, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "53a915b1f66d828a0ed94d8be23c842872f16372c6e9c3aa44a2f63e8559dba8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "53dddd8d976682e0d3c989933ffba5c2cf6785ba70450fd05693930bda8ec489"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "5e57ff573b7dcd2ece5bc1c80177afc6a4cedc5a69075f669641bd2d1f0d9a3f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166240, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "4948b4da2b43d00868db3beccf02f3af6bebff7f3b23fd99bd964bc243a57149"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "9e277a27c8134659216fe71457c2313e381c74801d8d7460ae0a4ab2dabf214d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 183648, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "70874ae517cfe5082fcae80377ba2a38825f74c32499c4856621235aaccb0d62"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "0575082a38eee72842674280705f9361112422bd8ce42455c88670f09c95099d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 181584, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "c71f33b782120b552a3f47fdf1c84af52ad1962194b389bf7795fe281eaa238f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 181408, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "466ae6a99e1b63a45eb410f60675cb348536196dbd19cc2f57c477a3ce2f5db8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157536, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "7325eaf4009442f13ea08e488f81c69d5c349edf7a090c2e18016d3bc9446046"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "d6f570bfdcecde5e6f41de3710b6308eac6561712d5f6ffe28e5592479817ffd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "f4db18c9dd6d50f048fd39a6f0e2d8d555c688ce600605bef5859f8b0cd9b4e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, 
"3ed16368b09775eb98bfadb821a1421aa36b9218e942c4611c830519e3e9d5e0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 196376, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "876a8e44bc81393623b8c584fe6c25e39388b62b551480343df51e568d4775de"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 189592, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "5cbfbdf283f64d15c27ddae562bde07bf8ee2749ffb7b8c578f5d971c6a21b0f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "b1447b54bbe95130244b1a6c5ff2936b95950d30e4ea0f0199cae876fd813ae5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "48d9da6a4761afc8fe7bb2499f8fb9de9498e2de40f2f7899fe5eb842862680b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "a51acf47e2bd2f0ce3b3378abf5c29ed5a4fb5aa27937a8f864642fd2d291321"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "93e33c074c846b367a757f969bddbd89a80719d1177afb594c65583b05adc6cb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, 
"ddebe60e0f716bb6cdd7c1b5b880a64e15b5d13d3cd67cfa076990e256674742"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "54c6573b1f037fe4bd51c1282a4b0a4e8311ee3bbf05694778fd14344860e762"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 167360, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "98e442675191a2eb800117e3adb824a97f20bce2f534c740d9244665ae91624b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "e87f6a5990223a7b7369072b37c2bf8ff0f4485041befccd9c5805cf4b9b9002"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 158528, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "5432d637d79f98df7546953584898d35c4e299bf4ea93b9280162e0553def2f2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "37a24826e1e1dc5754efa0030181fe5f80c226ab57d12feb91d90dae887a6f17"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 214208, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "6d845910bf3d22dcb77a3ad3de7bd20061b526e01e2bf3c3d3fb48e521e98449"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 
2, 2, 0, 3, true, true, false, false, false, false, "3f9d2a871346151cff5973aceee37cf71a640f30655de5a53c806c8034ef29d2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 208568, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "796bd1afb9d61d2512763923657d37c3b84f8568b077d024842d0c652c5ce11f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 181440, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "e7fa53eccf70d0cdb232005f59d49ee2ea81a3090080902b431af0c0b275af8d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "982093038e061abe81b65096150c8b69c2bc64d83b886c369be30437ea0ec6be"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 
16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 196376, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "f063c3a13ae934d70e38675a3eb71a56eacba297b3a4f5bb31dc3e51b0042eb5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 189592, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "1d2b19a19287e0bd7211b141bcf12998e82fa005e0c000074f99b210d4ec8c5b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "69e829d1776f638650673e84a0f359552120734fb5210b161ac5153a83165de5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "39dd3b4d72b968d1a8afe983a465263375956263656706ab5519757dbbf42879"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "257ed4575ea76ec0f5c19a1bb1aeef121a21c2fa9e69cf72fb251fb391c801bb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 181424, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "f0cb47e14d939de05a6bbd89371c08aeb3de89355da41b90d418d99bd967a1c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "1b0144bc45fc601268e7e45bd64a4e62b862a9fc9611df7d0e7c339b9b2c1ac6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "7c4a7c472d831618d3dac3bcbd989f6cba5a9587d7eb5bbb7bdc50c6dff9ebf0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "6dac5099946d86f3154146268a07011ca234e46595a9b1aec55f5e79b4707590"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "5e92abfff29a8e57630e31430ae0e5355de168b440130c8fc7bc88b560634017"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 214352, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "ca27804fe1d9fb5f80dbdda141c1bc24c2c6e76542db43e7ba331629769ce76a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "b213cc5648994a7c586377e01e03ba8abce1df7a6fa31c5b7eda702ecaec492e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "9e88010c11e4c9ee0d1a5dbe0c18a9a270293e2fe879afb4ea6e3d5d996cc1e1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166240, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "df1c0f9f33bcc5d7d77102e288b535f4b86380892eefe01970254f5d02ce29f1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "d38d07ee813fafa71f74301f68f1339845b216609f123a4f7adcf0ede78bf4b5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 183648, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "2330364d4542665629de4133a9e2af8b2a1371e897c12abad4fa1663139b7062"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "53cb5402049ec50b4bd06045a431ebfcbef92872565be2961fd40ae5e8a5042d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 181584, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "b2dca07f2e145b8b78b8a1a31e7665be1b2f6f2a86d4259b8ea6ae680fbf3728"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 181408, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "dfce7a14d6ed6ed479b791b002261c22c6978efa33d016c946438a1522da71b3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157536, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, 
false, "d345278f0eb17e6f824a9d5263c18a2cbfd4da5c7c5cf72ea9d96ffb38ac3125"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "3729f54a6455e74201a098291e7b430555766cb55c988b5253a35d4938b032c2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "ed362118cc97da36593002cd00b9c51eda04d71de1ce0205e3ae091101a61043"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "3bb1ed5a3e2bd82d45a57f84d53c5d8a239e2cbfcdcf9331e951c87a6dddcc99"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 167360, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "b57b2954a1715f2984e6f1c7f73c8e3a0061583a3a42262e8155b4b13f9bf73c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "0d4d5cbd4840e8d3217cd3c513846a8d7e9c42a7eafbef6799f1888025b10ef3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 158528, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "b59a5eb6b6c78f9567c1f77ddc807b0ba0d673be585d1ba31fd28e56018e0f7e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "4c279bb10d50abb62a86a45a409d33973e73810722001d3d75b2a8bb47300fae"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "af9fe8d17ba6c16f2cb8857c265203fbcdd807bfc8781b14745608c8fcdf39c4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "dbdc7d2e5c09281776d7bedb32c4f68eeeb220d59f6676736b0120addc6db1c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "90d0e33c1f0e8bf44628d7a35a602ab3fa51d46e6b31a1ce3ce3e5ae12356fdf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 
64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "e09655404c8ac95cb2b92fc5bb20c3e1cf6eab250dabae63e04121602a94eec5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "0de6e5e65de13479706d8869f0c49efa23259e5d87acb2a07c44954777d59a96"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "2ef0e119d181cf6d944198ec2a44a9c42e78a9de591b78e576f24b81caa9765e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "08aec7867cb88d82a633f30073492f99543c1f7151e5c1d25fd9e213f879e3d1"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "ebcb59068d963fa8e620eadec483553f22adb26ffae5ee785fc4f2c49a71bcc4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "01aa8425c7a0fa9d42eeca4bddca419a18f7935c251862c5e650bf2a24cf2303"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "f3748637877b5216b6ee19d8a27a4dc39a7a390aa9fed17c91a68605580768be"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "d45a45130b49e7960e3841fcb6d3674642fec8a2fb00f97f3a7ff631a756024d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "d37f86e648d3c567b8074c00eec5e55b51d5ea371edcb62b18e0c0995158e475"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "44a5d2cc28fd96c3c40323905a6471d05062064087d36940e054a71dc4ac1641"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "f7d2dc8a222b17a4d90cbbf3d3c9041d0fb121c2fe6c7fb557a80e75cd4e54c0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 
128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 197960, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "b6ed474bb5230bf918d2cdef1a38994d7ff8f368ac29d803ff846ffac9021f1e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 152912, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "8f13c0f1bc094892975bd50ed8f3e05a580dbc2c20721e094e3dd68d68e7d644"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "3f6eaae68c5e0cb04dd960c05fe49f0de86761a485a61474b4acfaf4f41fd1bc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "5ca59fabfec5250566f38a7e928158e2911a97bbcb7528480e28b6d3362a1898"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "5d70ee69ce40acc5b5c44aa7e56a94aae9a90a3b711f28040db9b613ece38946"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "abf33cb39149fbd37a2ac19734632373f011da3f820ba6e929bee34467c9676e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 152896, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "e8750f50fe73ab8c50a1280be705d8e31562afb38fcfb2b2de3a1fb5fa7c444d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "11e50a958a78e3c2b12d33188554da169d766da35009d6c3aa41bd981eea2e34"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "b66f1cbbff16fe3260f10dccc874ee8b5a49c942b6feeb939465f43165256d31"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "9c58f27e2a9ea9cdbf31c816576c450826db79402a72a09075d6022df025568a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "4a715e6e48a8c0d8c7b3f8e1a504e0c4e72e28ec66439bfb0e5c07479e54f94a"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "3a4d29333f7d9a9827f0a045185c5fa929fedc531db20582db032b9c3238e012"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158192, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "6a521c090593bfce3baf041188429610c77fdd06d27bc44578f559c6398fb381"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "6ee519536873fddbbbe32f3b2a6b4806feaba2a7e90a4a699346a9c82d08af70"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 167408, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, 
false, "1d0e1eec3c5d8134e5aa3cc9fcb70fd66742774152502104194e38ab8a96557e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "2fed6b51095e871ac44ea9d4569d59124f329b66d2e3cd733734cd9562535508"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 157168, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "62e01c4b33658c714153e2a2da3ebb3fc4accdc455f1337ecc4b47a6f7b31df4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 152880, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "a63f4d1a972b7601b2ab81920ea3d65fffd258fb759ff56795f702cc22df83d5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 
153584, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "50718f8dcaaa5563f39ee03483b8f6150ce5d2c50b9289ff7a39fde3ef88d55c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "7840968d73046ba1f76787353546e2ccf6baa8b5ee386f06dc2e483fb2d47520"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "6556cfd5bf02d618abe245acaa84fb8f9c474ab2ee08e532f03a7333776912c0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "14033c236b214d142dd2c5ed17bc744debfd60407ea8b8221537024c9ae32761"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "4a342ccc04ef6980ac241343744e30b7db448375a671b1f72022728d3c1b731c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157024, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "0e73f3e6306d2c17f6092d544b2d1ec3d407258be722f1d4b9ac00d45b9694d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "12f612c518679afca1d6c51f0c50649de98494f53821c7ecd902a999ecb15212"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 3, 3, 0, 1, true, false, false, 
false, false, true, "ec21fe53fa30d413868dba5191a657ec5a2165a84584e167169bab94bcce903d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "7511e69c27b07dfd0a9bda7c1d185ad3eab2e55ec8239aca2dda60112e508eac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "fd2e331913fd60e2413af2558a464d8d758bc0a9dd4c2d2ea7d06d5676c492a9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 165376, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "c4fd84de554a59e428ad28356cf8128f833b6ba8fb33f3d5e1aa5c086c416176"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "ea4ff53042bbe6299b66dcaa22b3635e3f3dfc4146c1a9d7271c2624aacf585f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "ad78412cfcfad128a315dcb4144fb4056ae409e807aece858969876b0f870b65"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "a754966fd77391d5635bbf218c47956a2ffdb67d6476c8948bc6931859d8d36c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 1, true, 
false, false, false, false, true, "8e5069dc4e8c41ccc46ddb67aeadc0c8922bb7380ec1556169b1717abe2647a9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "20818ee2b305a94f3ab8fb3ea19d22c7ccb41b6db36ff45c394f96da3512a2ff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "3df4244778b00658310fd9bb41504a835e7861c3b9a881da9ed87f76b8b8903f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "f0c95b052bb25aaac8ebbc3c8d18e1c1a57c60862d621039949502de162c230b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "7ef9156487c5c199728cb8a59eb9ba01a4bc98425a715691e7c35875b99167b4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "599395b67070cd840cfbd4d99bce9b4291a39863c163fc36fd35382352eb46a1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159312, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "6817ac9b8d851b76392d08c96beae9d6628da11010c2822e513dda300b10e101"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "ca7b3720e17559a405ce3d0f82624c7bf2807f17d88e2a46e41fd75edc5e1152"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154576, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "ff9312a3da7ef1a65f43e4e15b0858e2ecbc78583def64178e85b9d67baad98f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "61e27975741c051dccbdb7af50c4cfcd5cc89b6553d54da2f9633a571eb09450"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "9ee8db69d59239362be6ca3d196978e071dfd6f2fefe7ed61136d3c13ddbd579"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "e3098c7eb4e9f8a1b581a7c7668ef23d569c0c450a97f5d43b1e4684615a9a78"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 197960, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "8e8871743e2272c5a9d5e84bfa35a2996753fe1fc10960d194cca556bf267223"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 152912, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "05a0dc4be23af799d287fe2f33cc47b7063709a71f9a979beccfaf36fcedd611"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, 
"04428a6ae3c0f2949b5c64ccc8e76f18afd00b7710ff2f743507d83136c022b3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "cec96c63c3ea574c92dbc8b6ab7078ef7ffb5978e5315eef17b3492c2914e700"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "ab73a50d8aa0fa3aefb107a71363cd29d45d1f0a09ed29e803dfdfedecb98e28"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "7f83adf900c9e8f9cb7f261b926b6136de31776b3f501840ce3b1b57d744b273"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 
256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "e479390f088d810fcfb2e3678dd51feef345829e76ffb03f303a75cd911218d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "1149f2b2e07463116b7f2d767fea7de1577cbc488e972c1262dde7b8414c3653"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 152896, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "0db2ac46057fc824ee71a69bf68f207eb295cf0d0f6ddaf41d5581f3d7dcabf1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "b104500313217ac1fb1e9c80ea2293d346f339d6da098bb75e3d2d5373b4f619"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "0322dda254b7f68c3ac4d9a7329820e72e6aa7d1dfff4b96d851d665afd1d2fc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "9b2e8dccf764d23538a8393f4cdb8ef54143d6067ef1f6dc4702ee2168a380df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "18e918cd05b77f968eb4cf9161c13136d0ea0696473c05f8cbf2dc034baf18dd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "30ab7de7031b1507205ea22ef84a9b80439101fa57a6cae0875753c91c7b5eef"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "55d5c5defadfea6596e8eb8a982633f6fab4ac1f933b3d5146e545bb9f0e1d79"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "57105eaf5bf47f706c6d9208c6d462e214e052299245f4cca50c431cdc53a431"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 
256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158192, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "cddab1e741114e1c0148e62a2cd2be7cca62e05ff4cfe3930bb5df216997d083"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "943994aadecf046b6e503411c7be7648e6e268ce0fea8666fae1f3dce6114d72"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 167408, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "e9df3c5a3d195f90214f04379650805a000c95bea196f9f370f9cfc5c699d9b2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "ca1448b330af08c7b87a0e4a2fdad246fc5cad17905f88818e342f087b979f80"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 157168, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "a5c7bbcdf9277929561e53e552eb4227b073ae359dbb661f0d9af4a48e8bfd9b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 152880, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "b6f0e408e5b317c6561463185b7c968a409392ef4e901d09c060ceec0c707c68"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153584, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "4c2fa2a084435831d715abc227139472493db3c7c7b848e02a7dc7f40657497f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 
128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "cd107849e908eeee1770f4defb77a4893c7330effd4b3f12ca2fff668bb73390"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "352798c045418f6ee61afcadc28e79c01f62dd6f11efba0f0a6430e443faebe6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "b75c5f2569607521af6900e6247c1746fa419189290640ce7f9abb5c14873dfe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159312, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "9aa822e7d6cf443fbfe20bd2309681371d65f8b825fd69e95589fa7eefb1cc6a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "c12db3634829f5cb58f2aba05d5df34d05f1fb8eb44a7597d0b01dcd0fce8e63"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154576, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "ad70841e38b9bd83e26607ac20f9c39d16eab1d7ce8d1681ba55fddcdf173c23"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "62acefb3e44397f0bcd8c4e35a2f235c76223a844a57f455b16b7f0d021373ac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 115104, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "58dd3e9ad58c8d14df6350ac7b88d9347269d3140d0f7882a361e876bb4e2593"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "85db7440ab53c8688840678176d1393e8827563bd92490fb4b2594d116019052"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115120, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "ab17d41ce84e294cd6554d7b3fafeae1d9b9930ca2999a121038d8e7983335f9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 114944, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "38c8382dfd769886775a30e42b40bd9a91a92ca0ac6d0713293518491927607c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 115104, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "005ec6b5811584d41f85409d8c1ce1a9202d8465e08320d4edb87771ae75d6fe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "83a0d221129f4daee8ae964ba00be0564197a0052f01d2d84ca004d4cf498c15"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115120, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, 
false, true, "a45454b2c97aae2baa8d5d28f4c6d28ceabd5259335d79190e01d5a160772d88"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 114944, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "f8a5b1358ba2fdc769dfc29d613d6c91f376e4c57bfd2056136647ddf9f5cfe9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 115968, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "ef956a8097d93f7b3b2066b2e3e8883e83dc72d27fdac93c6a55cb0239f7a28c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "8ade6c1c0537542b4c75991d8ebc14b438309c677fc1dcbde4d0d5c13fcc2a83"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115984, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "2c17fb566f3a61aa490131518df840a41ca7282ee0207044db6f698bd5ae701c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 115808, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "9f47a3668a7311bfaf0aebdeeb4a23bbf56e044598a74095a21c6f0ecfe6cfa9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 115968, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "4afd9193e3caffe04e0b0a593b9da995554e32e4faa9f6abfd10310f9aa48493"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "15fbcc97c69725487bdfdcb06dead249cc2bec8b521ba82b56bcc3eb1cd6dbaf"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115984, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "4eedc3e16976da2749f265f906a9803fb1344b82a52d6b6e77d4efc29d8fa82e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 115808, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "7052b59b368b637cef024d097f2827e232683af2e99fc550b5d77160e610d5d6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 115104, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, false, "a22b71a9959eeaaad6cc41c1f9272c990b4d94dbc5cb493e35bd7027997f0e4f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, false, "ad62a060105825a4f5f112750a75cde2ce9d9b1bbed63d277bbdee1d25f40a6d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115120, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, true, "fd3a0523394dc7c50a16cb0d02775f0a17a66dedc1f22fdfe13fa608aa091237"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 114944, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, true, "9105389638e454a2f311feaf3cfd922fc0dcf5f471b7aa058423cffcf56c3a5a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 115104, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, false, "546855995fbe1325d5867d516379b279410f2dd4243fd3d1afba3dbb3871e9b2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 0, 0, 0, 0, false, false, false, false, false, false, "8aa011cb57439e342e800434247603097fbd276e2c54e527a0e01802f4663f00"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115120, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, true, "52089e5ae60e1c1bfbf7aa94c89f4cdd676e7d7408589526b82ce6e6eb331e63"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 114944, 512, 0, 0, 0, 0, 0, 0, false, false, false, false, false, true, "468d694b9eefd57fecb2a27fbbd59bf7c7db6d860a18f5c03d243c3499d9e189"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 208056, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "5a6c1c7c58740ab613296b0e283acfdddc8cda5f625710fb364dfc02cdcc2511"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 208200, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "36b229612ef9c279b18dfcc5a58f10491183f4474f5b4b8f76dbcc8ed0c69eb9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "c32011975cc79ff713ecf6e0d95717c2acc1bce9a5cdaab5b07ef48a52d7d94e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 195400, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, 
false, false, "519f74f0cd935df1e564323cc4d9cf9465ed9a92ca6597fecdb38583a271d9d5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 209176, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "df5c24360dc9335463941a95ff4ae2179927539808c047e248a3a8c0a954a410"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 209320, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "64d65a81f25bf97778e56903cb47038b230b1ae20d97127c53937716df55ab3f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 196248, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, 
"616a4953a71a0bd4ae588eeba9ffdfa890e439feda09f91583e0e16ae3e4b30b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 196392, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "6c18faccfc4145d3a43251de7079fafdef8cf192ed44337d34d64a68d804ebcf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, false, "a0c11348938008f9efb801c63ffa4e56528c8f405361578d4c06c9f0f53e44cb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, true, "2c885b688ba81278386f4c31246a99809c899b7d8a46a10412a00c7ed00c761f"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "38ff154a708c021e066e01bb629428c9048f54866452cf4d5d4ac7a1a9f9bffa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "af9fd855f442382af2dc0a80e94eef27670e98aacd9cbe5ebb3f56aa1a6fbec7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "5c6c139a660c328859bbfd06f5998ff600ab0219800d9f770eeae688cf07c8c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "be5d4be024d08edc1f8c164352eaed3f224c2c69dbbfebdf8da7a38bf7116700"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 175376, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "109536c943bd79c36e0538bb9d4154760692a801a955c8ae5f62cced43a73769"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 175520, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "5e12017c4faa1c123907ce636808fb5e59fd3449cb3ca241d50f1b52fe2e51fa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 162448, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "cd36612277f2dfe34be7397ac7fd34f59ba3cefbcefaf7f444d6079baf1a8589"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 162592, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "08a75894bd6f9008715ef084115577efb163b3a7ee5b6fd5db1119f39fd7d750"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178528, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "622ee0938f07eea02f4915067650994f46fbcef92ca978e1384f425c9ea8c393"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "6eee74d3d1d1bf763d504be43cac71217dca0007da8b8e595017f0b6db962839"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176624, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "615b9190237eb4e6d9e1db70b00266baa703489a4f4efc1c96a2e88268d6d73f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "7a0eeedf1e5da2a1c38d194fc060a5215a4e116fd64e9cbed28916ada1bcc4fc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214336, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, false, "954baebaef930e2f4655bdff7b08bd20ac324a3958ee329e89fb4f7c242efa7a"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, false, "cddb024c2b5956772e89e06a71fc0328be2ed83b8c8d09b558065062821e9a90"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163680, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "a00c32a9bdc41f6c3505f7a9c5fbc3e0342e426d7e6f8a7ed80697a7e206a2f1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "683979454874df9f0c591ab0a93e700b47e55b2fa37196aa4e8d2deb895b663b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162800, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "34b863a8f98429887cbb4c5e373837ae6902888e4f14285dae7d2feb578effc0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "f1861383bbcae4fb8f19492aa28406245cb537c6157a4b80bac18664807db7ea"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 179648, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "afb33ea260c2c28a75c3afce614c11ecdb201f132f81dc349c3238873ee55b21"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 175376, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "4f5f5126ea153b7ae44bd466a087e1395d54efeb526a0e739dcf5b268ea55f23"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen", 177744, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "393ef25814beb66bee8421b6e15399f75df982db6cc0d79ad22068ab2a9ad516"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 175520, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "62429d7f4f9450dcafc6ac6f080f66c39644e5c7f149ca8c06f8a81c9500d37a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 214352, 384, 2, 32, 1, 3, 1, 0, true, false, false, 
false, false, true, "7a8898444b775c269cae9a5647821e5dab5dff7f73ddd646b0970fac7619608b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, true, "9e3a09dd84029a4c9e6b602809a12535be160126f68941d2041b4c53b3eadb45"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 164672, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "7c73ad45dbff35f34a9dabcd31d2e2564de8daea378aa1243a38ddb2c1557461"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 162448, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "4c2a5a92f0fd22a2ba661b6d2c0f538b933c44aae0fc45e156d734e09750257b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen", 163792, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "f76274b4b7701ce204fc1be8833a98cae927bcf7c0b6d90899a5444e5f827227"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 162592, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "683b45ccc9fb43103063a9826db5ce87be5c4d0504a08b86678d0ff07d316969"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213160, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "48cc194c20414f0ac996e7e8057f0dcb86ccd60e07ce8e9e645e19019e84598c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 200488, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "18a607b120dcba446af6d6d895a88de0e496ed997451b8c0fb9d5a05e5b35517"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "ac74d35cc1eef7ee7c4e30ededad8d612fca9b26afe00b954adac6cf725d4cbe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "1f193ef0b94a4359b075074b7dc9f61866b7b5691923127cd84bf9f649f4f173"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183632, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "9f32c2609521d92e02e6566f14b1a4a8ec86b957d770ef65ed85be6060c07286"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "5224943bb63c0a2bbf4c7c7300b5d8159ff36d40d1c7ae12c59f03b2a570f73a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168912, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "1ce50c3eb152ee564c85110d0b690c271b2d32be0145c291c8b7dca3855c8061"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "d7e7108a8df4f35c2fdfa82ddb65ba2ac2cb4bff6cc71000b76a6de949101c0c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207544, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "183f82a8771f66ef9ae8c51708d7a84ad571b7157833e4aaeaf4cc5e103dd087"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207688, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "c421c421ecda45749aba17a21198580b0a3c80cf806ff11cd8f9b434cf8cffd0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194744, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "5e4ed42021dc1a1151b02e0c480c96fb3f81e17023bc4e5814cc8937fa85eada"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194888, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "369e66114b48e15736ca5568aef14620afb2876ca7d7da55e186103a416bdbef"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 208664, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "4d4cc7d95a48e439e394f557fb72d2859ec6cb402caa62f2651b93bb7e8dd452"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 208808, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "39a32babce9e5aab53e966f0a889f841a56b97c1cb58efe9d6deaef90e09e06c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 195736, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "1671dbd5559be50d5f02a020a8ffcf2fbf702b21292db512e37672f287c36f6c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 195880, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "d93e5da5033efb764dde10e7a414e8bbb82421ce3de8f336f2c79c1955d09943"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207448, 384, 2, 32, 1, 3, 0, 2, true, false, false, true, false, false, "a30f28c37f86fa8ffc63a9693b8cd3a0fcf7ea3cc7152ffd21b01130ec4389a1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, false, "88706b72df6ef983b401d34981186dc977ebe18994f033852398a87c1c2b92ad"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen", 207464, 384, 2, 32, 1, 3, 0, 2, true, false, false, true, false, true, "9df7c7b2392dcd1d0d28dc4861296a122450299e100e39fbdd6290dd69d75096"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, true, "70ad39d1791ee999740d2fa28992212d11408eb431fb7f0cafc04d60798c9a42"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "2ac4175a51d64be2a89c3cc68a35520a3a412ceec23d4a83ba7b6e175df31227"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "f880ca51afd6164e5fe3295ebbeeebaf15f1b07407f01ed6c7fe4fe1ac96cdab"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "9db636a350bc6e705d7619f162830a41596fbb4d24769b7202bd73933ac1ae88"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "41bfda0afdf8955341874362e86d22c379f5b010d0cf468927767298adc9b9c0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 175376, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "008132e502b7e0cab2522ffd73be120f510b397a10a0fab38cf936c5ecbb9df7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 175520, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "96f18b116497588a0433dd48fc93349cf7e15860c7d676c930b6086e2dd1fe38"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 162448, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "bc6625272139108f7a2f8a8c30b7fd0711ce47174c47b73bee363cbe2ea505d8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 162592, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "a9e5f8328eb3db491563bc91c445128a3dd7db364b094cf796a6599db156c2ae"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178528, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "5c7706f0f720d6530d1b696d412daa6f6efc37f3fd9b8a3a0a91afbf361469b1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "46ee55326ad0d3e5bce047453c8bdac0891a548aa475dd82a451b309282983b4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176624, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "115ab03f7ca4e367ea5740137a7a718fea1aae5f67f5c45df6ad6b6620842fcd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "8bdd479a33bf0c79613d434ab2bed8ee95b939cac061982b9de19620394d1407"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 207608, 384, 2, 32, 1, 3, 1, 0, true, false, false, true, false, false, "f995d0c7b9de722e693a0e09eac55406f4ca426b3dff8563666abc54d373df5e"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214336, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, false, "6e8ae6bbe55e8a9c247f567a71047f6d561d50c9fa0a7bcb7866ee242c7b06e8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen", 207432, 384, 2, 32, 1, 3, 0, 0, true, false, false, true, false, false, "3bcb5a155071f8d5ec73367c3f2a1870f9d3b1c470eaa667e9c515fd1b868a2b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, false, "fd67bb3dc4e70657f205f8fe3fc16babd15be1f1513be5d3ee975f310e1185f3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163680, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "760135f74fe808df8284a14e4aea105aaa0df6846513a4533b71b218ab603e38"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "44b303f8101be4d5e76fd932124210d5c592a4a00763d31c7a942e9142b65686"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162800, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "5b2900cd4598136a93dcc13dfc96dadb93511878c182a5aee5dea3a6148d4bac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, 
"9654927184de874a54076ed6eaebb058d41567aafd1332631019303d2a77a4fe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 179648, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "0818ceefcc7f8e9a4500f239faefc2225cf5c1c8de4ec5e741ded9d880373347"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 175376, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "bf00541f490b9fdf4c8d68bd4da8b10d9783263df8308aa1a6722b963674eca1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen", 177744, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "6803e7a100a461f296887c112084c5c63a98d342a08dd0c21c4c1e9811b03ad0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 175520, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "0e70d50ee9579fa0cdb3b1d6500ba876e271b2b127151669ccc6706c98a34537"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen", 207624, 384, 2, 32, 1, 3, 1, 0, true, false, false, true, false, true, "895e6d3ed5e0ad529eacbddc870fb4f1ef055e9d340a9d34b43e952cbbf5257f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 214352, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, true, "6fb98d818e8908cd742d720020c8dabf89af34f7e710fb42c89756ef2fd3d9ef"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen", 207448, 384, 2, 32, 1, 3, 0, 0, true, false, false, true, false, true, "122b7ffaac7c32695a9ad5bd41d4f12bd56caa0ba862510690995895b24878d5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, true, "bbfc05b685f992327dff6f90765d2c055e7f86d43fa46312db777b7830f7734a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 164672, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "7b9de3f0c8b3057f8b128b06f4bb4b24cb36d9ce9d1f317d34ff8cad10a989ec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 162448, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "b25b9e48f8f1d925ba647d0c2a8c75ccdae537ab38d67acaded42bfd7a61fd67"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen", 163792, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "d56dc244951560d8a273b8a47794a579dea58337a0dcdb8dd3be138a48f81e01"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 162592, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "1f689f704c61692fba34c1ef5323272bdbf38909924167fadb61de32d985e352"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212648, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, 
"c888ffe93c1587240f0d419b1fa5dc46ea32c05a5877af983a3969d29f1c4444"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 199976, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "bf0b9091df5834638799f308ce71e82c570b6b217ff3fdea771b7acb879be108"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 212824, 512, 2, 1, 1, 3, 0, 2, true, false, false, true, true, false, "1920ce7a0b986c20f6a2960bdd3ccda7a55480c70fe790d712b278295f8ba8b2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "4faa7ab26891475a3307d5760fb8baebc977d852d20696b5d8793e026c48cb89"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "2f2f21f5500fcfc80d14692661a58dff60377453f36398194795e2d8a1073891"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183632, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "3c0c3c661be27227e5588f9386e47de59816b99e389597e63c34fd18a8e271c8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "1d9778ce9322dc9da55df2ce2f37ad928918068e95278eb1c3ad50c86bf5512e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 212984, 512, 2, 1, 1, 3, 1, 0, true, false, false, true, true, false, "f51d83718014961714354202c04d3264ea49779bde7ded9bae4a05724ceae51f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen", 212808, 512, 2, 1, 1, 3, 0, 0, true, false, false, true, true, false, "d3e45ea7263410ecb344469f5779582be3dc9efbbf9fd3a214d3c08a8de409cd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168912, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "366730c9c681bd5bbf394c7b9919306ef84ba31ad96ba3a5e4785429742340a7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "1e308fb975542ffb07c96ea605180894afb0c2de8c262f102ba659d0c2a72007"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207288, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "2a1ec0c869feecbdf8c42aaaf49ebd528683bde8dfebe2f4ca2925f3183cc817"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207432, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "d0de9affa98a5a34a65130597f25f79ea29d835b2a89e72b7927df9f4ba933c4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194488, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "ff774eebb78447da9e6b005a4a64fcecb665c12f917a83ca080384ec6606293b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194632, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "56b279042435f999baf65776b6e0220ecaca3e2c50bdc52afe9ef1b1117a96e9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 208408, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "8a6e8d9c9de5658c9459c7fa4a92f7197169b20a146a7cd9628cf0d4c13e7ebc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 208552, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "bc8aebda5e52a77607a9c1c85af2578a00b9f6544159bf4730643e4dbb476174"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 195480, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "afe652ffc264237e7945515f7104fbf408a12cf7b65df69dbbffc2e278e7a478"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 195624, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "60287a04786702229bf17b481ed5530dc9af9b9b61c421e64b42c7d1a4306c77"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, false, "77adee886326a57628acfe04beaa5febb1f2d694e76022182c293a50b280c542"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, 
false, true, "4969646ca6708bfded6dd3d9f47eb9ef2011338acf918eee4404512360599461"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "b1bf2303400a47ffb7a0f016ce6606ffc4789ec96452b81edc9751e8364a5391"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "0c5d4d09a21fe601c218a4205f0f65854d137949d31065a21624ad9b09b54de1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "92bef65d19abfe6ec244cd5da41e819c9590a991ba3f5cf6742d79baae512f5b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "fb3cf24f08c1f7be12b0f33912f28faa09d03ab96c683383d9b1fb0141e7b607"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 175376, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "cb37d708ac9289e7d360bfbc2e3f9f7caac04bae46dc56501229629bc6ab755d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 175520, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "feedbc0e6c06ad1fd144d2bb58f8cb595d45b7ab88ec2a884649da769f7eee2a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 162448, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "d00e81fada174bf581690a34742838d064a2d37e24ecec44fd659813b05d1b87"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 162592, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "d63f80a482740cba0da7079f691facceeb7fd3bbcc13f94516f4f7cf18723f66"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178528, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "a598818f989ef67c928b8792dc5b9ef4ec268c78f50f4d5af927a9193d11fb11"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "7c165f3c3a2dbdbc0434599e86648160627218999a290a5486f16675fa76eea6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 
128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176624, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "7ac1c6e945d7cbe96c1d21fa9992d7388cee066b00fcdafa9deb12da37d63090"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "811ff6255f84f00745900f4a3352d9aeb447908cc0f38f35a1b400fa8cad79d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214336, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, false, "e0c3f1f3ecb7b8e9ffd1d681ef3e76166ef18108de7a8a56a41450a74fe74c3f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 1, 3, 0, 0, true, false, 
false, false, false, false, "e5a2bc343df571b961f3a399e9a22d0c190c54af2a0bdd397fa8637a80b394a4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163680, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "76a8f0c067e370200c5d05fcff041fb605d22754b398f7d8c2a4375bc5e873b2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "5b551c3686416d274d6f796a8dcf9da24699e0475fea19f214131a23c3b4ec99"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162800, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "951ac0157b0d5275db32531c71049b7471fcabbdfcc3e82e1574a38f86d78481"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "a26ec003e14e0a936aa346a01a8cb5e049caa033196b5d44e54c630fa27910cd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 179648, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "611925eac7b58eaad25d87e8bc4b532442597141e9a68f5340eab3606d69ead5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 175376, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "1ef422ee3588b97dfb0b4d5b7726f3ced24afe5727c356411fdf6803e1337aa1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen", 177744, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, 
true, "045dae7d3a4e7b3c419e17d4858ad870ad57ac3e483aa2f6013f49eabb96c2e0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 175520, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "478226442f7c3c3ba4164c327db45d25747dc2efceb10243be4a1598a60f26d7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 214352, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, true, "2209842cddce8604250adef56d864245106866a6dcf2b53001dd845467129847"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, true, "26f0143c377f10f9b05b583c10e0e387580d9044f3ef519bf81a228f1ff58f43"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 164672, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "71559ef06d8bedaf98c1cac2873a123e2d9ad573cab0f6929cc864d8085e7f25"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 162448, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "55af1a0a5d5d78e75520fa6da50cf687f8265d5ee713acbbb13b3b037394896d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen", 163792, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "49b951e4333b61ee48fffe26c76cd24b4774938fe37413747b5c3f7796fa3488"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 162592, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "c5d699fb60ad1f25115ec2bc06c980534745aa32443f87b522b29ec6be311bce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212392, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "5d635f5fd4a5686c6ecf145fd34ba5532f8d83c532be20f4e86c6cc04c044a05"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 199720, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "48503cb46877901c2331529bc161aeb12edd7a3d5ca516db3b65922c530a4e2d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "01adedb565a36741976891cf33671016fd7e902a1eb457a22e0dfe5abf472459"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "c5f0ff2472af04dee29be4cde807e034e6732c24f3b92abe9153539e3b518ac9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183632, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "370ae4e3758fd0b3463a2195072d5df8252279b3456e93d3cfcf97325b9965e8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "23ddb6d963c91b890b35696373b28195f35cd440885f2f89481b3d95e183dbc8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168912, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "ef33eca4943f76b55b566d290ca0f462df0f2855a4da32dfb953b81c92a7a8de"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "812cdff4732e53370accc121efffcbdc33f99479c771a4b781e13c5afe7bc33f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "2e1289dd30eef36d8dd30007acc98683f7d753e1aad093208853aa0f399cf226"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "72f6d025501ccc94175e2d9c12f9e22471dd557e810e68ed23f88dc4d0b64930"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "cb085ac15b67bf5659016335582e7277340afd4c13c42b3d65022261a685ccef"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "52955de13bcb8d0ceca1c0e0b8fb99aa5f300d1b39b984dc1f3cae8620e5bc11"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "3cd31cfc8725dd3b340a9d69ec031423e6268079c880f3ee689e5dc2e89d35b8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "f553b1810cc13809eb73e52832e3838b139866364f8d989f0ef02f8f89e99579"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "3e85366a3510229b75dcf00808f9ee5058c8055f3a6d436335cb43b7eba8c6f6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "746b3b9bf8ade66175278897f20fb26d52d9655f15424febe3cc525a138d9ac4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "c0e73baa5c9f6b1daae661a9ed990a5cca8e58d3c66909f5eb8f034b18ab65f0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, 
"c9bdd971ce144d1bf0ae57b4c940336f2dd9274ab52048d074905501351ee2c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "e1f0a1e2bbc2332c85f47141eace9ca19bff7cd0d027353bf9b6119fcfb7ae7d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "4562113a43d08d7479d6d34e4e17db688828fcf95a8ceb4d77752d8aace82301"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "dee1ac998ae01619c44ef8b82c75dafe37fecb941068f7be0404a9c82bb25803"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "582874ecc102602634cab997506cca71bf3186bc1efe81fc58f4ca965a3f23a0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "8013c672f3e6d3e38ec42a5e5d8662aad4a0437e9f3b04822b619fd202a0273f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "78b258f0f01a78a9af6f38838fa95a036051bf0fc52ea9be64cd1e861a2db285"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "0d26903e0ef5355649fc3175ee77bb18ce1203ed9f95fddfd0066708d4535163"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "94c13578c7aa44649e63e81d5c8801ba8c1d3591aebc02813355867900ffa631"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "f35f1655a70b132ba7ab838b472a7cd6d1737469b92f41f46e49929110e7e8c4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "23ca8a92de71abee20ac913a8d37cb2f9bcb371839236e24561fc79014688b71"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 
157872, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "aa9c478bda044e476e7cf3507f18a17a3dbbc3f52d7cce786f3b68e8245b8147"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "21bda4b3e3c4196277b7786c6df8e4aad6c309a631fe0335db7b5c61016014ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "bbeaf4d7549aaf4b51cebff7d4404c2879399058f034551e49d8460b76f62afa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "20d1a09eb61583b2ed50b7836d639fa472bb2259357f4a033d08fd529e07cf48"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "fce59e0b9d9118a035b344474cb013e8e5fdcc7ae7b8275815c0f4bd7ae3da2b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "f3ad1dc6e7dc6a0ad650592674231796be7da85c757bd477a609d2e4a54632cc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162144, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "83438f208c967191e0312a86bc24d7ee5133312d7e3b0e9c8c42abca1a6fc429"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "433269f93e11301fa34cf219555c8ace4fe54b5a340b491f25493e67218555bd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155488, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "f919c9eaa95e9e198a9f9ebffe4bcb5f409fc9b6a43db616857ba6e8a993a301"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "3e1de78aaaf55aeae17663691185f987bfa25545437225613064be3f5390442e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "7175d7273e13ad8735272dd78183dbb9dd3813d2f210860f30d40b53633f51a6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "67e864927ffc7e3e2c67077c249716d4d78a07e99091261b10cf7e2e74a7b145"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163264, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "4aa6189acdec41cf47f37145b7fa1ecd0064ebf14efcf97197704e5cd613f8f4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "d86e90b012568b663bfae26edb35d4fb2241d949fe888b5ad236f476ffe2891c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156480, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "d5aec70a8b528068e02b92e2e85fd507a61deb5216ef59940c432fb1841cbac0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "25cd773185d49a426288793776266b2501bdf87b70a5417103bc5a787b3f3c8d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "7859ea47b5f8087927e20c302048eb973011c22998951e069dc26e77e2bb294c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "5f7b0d32a74352959f8e22df4d7764b0dff0d2daae12d8987817758ac4c17ccc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, 
"0e740ca9736f74da002c0cb76d861c93521977d88ad84028018617b2dbdff808"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "7acae625c6440d90dc9c8c9717462433d8312be2ee51cf258757c02f42c334c8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "3ee2a005d3f0061a7ff958b14a8cb947cd32b686bb23abbb0264688edad895bc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "dca00fc078629789b6e90c78c41a81364b2eabd021985df933380433f11628fc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "ff40a631af17b66199a0245e6bc073b832fb2494a04968efbf15fbe2eb5e4244"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "b96c9ef020af267fcff20a7e5c3742487b40f66e7020840186a83d92988cb0a8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "9eefa3b81dd33b7e82169ffbc901f292af8896c8fa056239970d86f33c1ad5b1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "e09d7361810f17725abdfc70f30b6842a06121cb4fd82539a7d6c3ae9482e9b1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162144, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "f82d7e470360f905ce26694cdd58f7fbe97c75d6dd5331fe16cc00bc0d535266"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "935c0362ad257fd732a5f3fb7b64b09ee064e94c680f1128e2b8826e3c02b035"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155488, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "e27d87b04bc23c4a636fa4174903578eb1909541dcbbfbcf7bf681fd55c23871"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 
128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "900b50cc8b9bb3050deacd151b0b556b7c1ae73c02c58159154ad42cbb6b427f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "cd7263d82e5e662546fadafafc959b3db6e464c2442ef8884db59308a4411e24"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "8f8d46e2d3e5c700c7defc9521cba71df19fff5d783c35d163a3901f376ec2ec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163264, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "1338227c038cdbfe8e4c559454559ebdb82549c2930e4a548f8640d77496fd13"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "37e6a72472b2228159fdf5e4fcb5125409d960f4fc25328f7f5e1cedae88f8cd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156480, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "569f29f87f20772a12c699743286de61792f6cf4400ecdbf1b5f96c32bafed28"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, 
"502878be64e8efc1d050782c994f4e831a17c36c1e9b10c573a27c191ed079fe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "ac6efa5a506da4c884a0b630a5a7ffb95f41f708433d02ce1b9909f187962411"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "18ab68026d7e08918a0184f9ee2933dec7bd2e987b22785c3ee169f5e1980093"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "4535784428c4a4b07bf27b4337eb11f3baba5b145c74c5d201f192af3aec9414"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 
0, 1, 0, 0, 0, false, false, false, false, false, true, "16b76e1e26bf55d0c17f4486291fcfb3ce1d167c705a2b7fdab6210de81895ce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "059225707e45247e0530cc093e569c627a17272de7501d7e22e04135abe0d599"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "354ee44478aaa286ac4f2b505a8be10c509d7ee36d8049dfd647429a8475a571"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "f758a3659033a0a349637597153716e6ffcafb6f9266dd63d7da722cd0a00f21"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "4fbadb52070102941a0369176856823de4a56d8acec7fb61ed3cbde78578df01"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "7e175c8f547ca05bda6d404b5507c349d7ab6162e361a64732cee76d39bf0738"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "4f6ddb2b7b08ca52b49c294820a9e7fca4e316d06395ffdbfd93f163a127edb1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "e94e80afca1b5b59fefafac47cdb5d5233f717911beae9fbc52268924df72e20"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "20db6138bac3228072e7d558608295b23ab4fd20d16b5b68c03decc717e015f8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "fed86eb5c3e348866f5feabf68d2716fb3dde676dc18b9162382d09d7e17e1b3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "d926252d9c14d832f0987db9ddd3e90a3d50a40fce10d0461464383ed25ad4d2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, 
"f85df78f37b327bedfafd5cd5b4b20c5fa710700c58f42f8ad2e46780b860f8a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "6ca85138cf318415848ad1e91a3e43a1e11c3a090b0535baf69caab44607105e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "95e184a3ffa1dd1f10f4681a3256586b3dc62d36f6e396f479ad5557ed0cd4b2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "2e9f6a1470b15ff537e376927409048d790b5ca92fb6efcab3eb9c2e2e8973c0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 196376, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "718524225c7b5d18de5079bc24acfbee942e06141f333661fbfbc0c5070f13b1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 189592, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "7ae99c1c162b0af7eea4586287154e5fd638791740aa687cedfa5311d563a42f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "9c59d904033efef23d250a7133c98248de5eea70ca9c7c3f15a3022b37c304a0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, 
"f8606ff38c70aa75f9cc413f1e7428d2ea64b6308a6b784172866ff1fa82b246"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "df38879e1df29280dcead7525714f5bf3f0c646a0c29b80aceac5d6d13cae2b1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "ba3da7fe8757475417c45bb33d2f9044f6c60ff4fc047e2d00cdc9dd2d8ca3c0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "98aa9c308da903c544875c8908a8d7d6b9b2c11598b496d1ad276b6bd8165e6f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "88a012b2567c3a34422e466d890a204820914bac301d2dd286f25e83253c00b1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166240, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "4bcc9c731fa6757f5196c4154dc87846763bed3eb8e7270a9177d911657466de"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "0249306c8ef4c6d2a991c975c4beb29a4ef17e6aa5847a8e78049289dd094090"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157536, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "d2e8f1f0b680dc147f72aa1a64b525ec74ddedb05fc57473d9b553595ab504be"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "c2540b6fbc8df0cec87bd05719229cac71c8f1cea8eb5f6903360f746481808b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "10a99f913e73c878c76fb79745a43174aa0dc909b01354e1aa741e2147d20be7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "ceb7a6ca4e7d34bc28a41881268592edd692c54b4c63d691e6e5b65bd6f70f26"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 167360, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "2215793490c1fa4ce525a55f7574bb77e2be492c8e2446c89c1d7e0347648b58"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 
128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "093179a1e166c80db9d5a8d9571f0b6d6dfc8e2e99a1c3fdee3a3b13988deb84"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 158528, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "0cfb0ab7e5f4bf1ef5c4065138712894bb26cc6f8fa192103fe141151f6a1c94"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "1d3b5979f09b813e3109fb3aca55ba3e157bb70f10c031cc0fa0156b64f766de"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "060c110359e8dbe47ac3ddca883db6d05fe5f7f468a62c152e952a3ccd91f268"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "d49c38518f2e541a540ca07c0f9fc11fd8c8dd3fc9e6229f1d746ce6558c0f9f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 196376, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "fc467a6c6febabd2479de50da5e426e078407a97c213a9f9c88721b07671039e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 189592, 384, 2, 32, 2, 2, 0, 3, 
true, false, false, false, false, true, "9cadf8895deec7015d0d6a6249d18986715c9da6111d124cecb44a85753119bf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "009f0b102f66dfb1cf59cbf94d626e5b8e58526178d0282d4054c3cf53216bbd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "50bb362144b4991032500d0b09b119f7b18cc42c17475548d0fd6915bb41ffcc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "bc039148dab46ebdb643496b9127b9dae96a17eb276eaca8ef5eebce4e9a9e6d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "28ab9f9d07f4c7d3cc67decbff45a851282f6f893816b736201b8c689a4afd12"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "2befabb9e707259664408ad8267849a564b4aee756e180ffe11486c56ad7b1f8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "568ec6b0602f77648ef9a0e68224627c70d8aa00552eff396ac8bda45a25cc2d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166240, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "0c7361b41595a48942ab7c075fa05b7fc112a42193bba8120c3eb519298cfb39"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "eeb98a36b2459cc342c38474125c4ca1f8f71308f36fea7066bd39d6d7508769"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157536, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "d1ea1bed2c2aa52329208591f54755af33838a1ba168ff28daa3c8737b0b8542"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "2ef333b52d7a1721d5f353b3ad9631e83908ae1b0eb861a270153be0c7c3c34c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "525804b809df1db949502a84581fe206633ca4cecce51d664f768ab182b1076b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "e5e9678469df47b2a2b3172e8494f56e962215262c41cb0297ddeb38f1b52e07"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 167360, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "e6bdaad1395b7b0af9c80726c12202a4c8fb384707a278e6c3d03937ac82c5e2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "ad1ae4afe9481085f0f69d98c75a3434475d85ff24b5683f627c5a33cd62d386"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 158528, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "7d9fad3235c6360ed28055216b42956a276166ec2dca8bb0163cdb370df20502"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "6e8350005618ef1d7c620354a8928f203e8b8cb720982decf7b1a255f94d769c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, 
"506253d0fdfefe8b6307da082d086e9660a01b547d56e4b628fbdd733cdd14e6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "5eb879ef9e028e1f28446923e4d443a57e90acf8918a5d0167b7f34ff5464472"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "232162645c7714583979001c38177c06b195fb5d806040d84f4a4e0a66cbc31d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "14f957c25af9189cc18d21f729d427db44d786bc62b36e2c13759f72b2ad66f0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 0, 0, 1, 0, false, false, 
false, false, false, false, "8acafdd052ce9f0bc69bd90b77a766c14c27d661b2fed093d4977b8162227553"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "0f7632e18322dabdd639f200b0539c606cec53db0b17c397fa95898c6a77f0e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "14e7fb9e27c1d9ee41d2c3293a4c3ad4fe71e2c88110d79d858fe615585af375"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "103efd7c5f497115f7c21270e1aedffcc2c67d67de24ada39155505a31c9230e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "41bb352ffde8c2d8fcdb7262ef8b745dac70ddfa31bf550c4bec42e3a3d83dcc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "f3657d949a49ec32a6a88f5b56fb5dac090d649c1f0d7c4e6ab287b9a3a8da0d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "5a2a931d0b9fb4711404f8753206d5c1bec4f346f110f6bb1e80c2ea61da8560"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "66d816a2d7787fe3ee22b3d45baf12f118a0e7188f4332898441e1ea5f6dd970"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "93271267b8f46653be8861f6687eaa09dd4a422c35535dbf91e52b2f70b9e598"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "319d6e9d75624f49be62ce058a2ae81abfc3099dac11a6721f681dabbebaabe5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "195c5d303d6a11168f62fcb791938227a59eed7bdf82b55ca92aa41ac93aa35f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "51ba91646c6ef3c6946103268b0cc6b990ec29375d4ae4f4cad08e781e98c060"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "379108d6c5ce4d1647b25621def3649fc1076ce5f884fc00c070b9986ee0d169"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "6acaa5f91c3bbb189183e54bae1ecd89826ad4f70235a57bd08ababe66c90f89"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "69418051e6002c25bb81b52d855d96cf74bca01b0af779a49a87172735080847"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "98960193018c5f8a9b0037d99f67030a3a43567d862c13e40fe5256915471844"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "f1a934a2d0d84fc3ef4326ac80688b08cd84bb9554551552d55e6ce87b0cee16"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "5b984e2b90f5854af4ca50d8611f7304f3a5f80697529637a8d01f68a3fbaeb9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "f018e4e09ef0a17fea57d2601bb5a819e015f4aae391538b6c91ee3f27028cec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "7703565f4daed7720eab6ba1256fb7c83352eac4034c14aeecd758f72f7be42e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "20385ac5db02e198e0be994de32a4db779befd665e282b1c5624c3c1b9f84eef"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "70a00b4f82753947fc19ab9a4a8727e09870423750d2c2b8a67701df82675f57"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158192, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "46fac0f90cc962628c8270cd4fc958bcb814484595e54d9e73171ae3434551c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "986743f49e6e57799a2c36e375e29240f9bb1ff203c492ecf4d064fe4e6bdca3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153584, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "11102874c8a1101d356a6f649da85e2088920ff32ad9ea86cfc53884cbec2037"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "5b328b0f1e77bb3c24793c076b2561439be4347a35a87a03001d483f412483d2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "ad069143b32eb9f9841a35d7ab604c6b72ff04b01b7c8de5a8c0d3edc41ebd45"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "d95635d0a4bd9433b8f89ae675c02f31f179b0b81bca3d2bd007c75432a1c1a6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159312, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "089b245266bf05b78d39bfbb1594711b9afefeddc12915772dd5b6fb54366946"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "fe3d89b3e7f75083fb41988b6cd4e56b50c1b3fec375f197b3d735742355f45e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154576, 
512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "3043b74b6065961ff599a915634d0b94ef84d8123ebbfcf2ec61a223a06575d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "9631f89c22ca64ad0fe5eb3e40fac5f666f24c7f0652caf03422b59b41d9119d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "6ff31bdf2fa48438a0ce99218c76a617ff782089ad1eaffbdc197ba10395bf7b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "48c9e58e3857dfa86c9d489dbe6ae7ac18846569e52177c1494966f37fa2586e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "54d12a2865cb091abc301e6b8ac2a4a71f7bdd72012dc02b433f24658066915f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "524c76a1673b00c24b43d2767fdf5ece7b4a374187ff2254d309b6c3f8e487f3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "e6103c37823ddb650aaed9b43f1156aa0433bcc105514d91ea94080cbdcc7eb4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "360a788dad0c6cfe9927ad5a32507f1072d9f3214d5b28f278d8b30228267a3d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "179bdc64355512e0726328abab087ad540f6077e8fad9d08571189a617bab075"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "2ba066c2d0bc468b4a76862d8b7916690fe14ee1904fc91c93f3f724ffd11f2a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "074888f713ca97b28d3230b895a585dd64d9604160ea4c7e61cb092e77699b9f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "ace66dd9ab152106d14a9452117cbe5451f97d8f47b4c37e0a7ca4ca9334f69c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158192, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "c93f52c5dbd2ff1e52af81836e7475f7229bdbe35f1cc842228b6d55b50142ea"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "58eec25f3770656f5f50734b80a396919f4ac1f35e65cb7facfdf52b2c149218"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153584, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "1350909c59975455556c3b88a79853f6ec1a22225f020e38387c026309e729cb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "d66698f7cb57cce460ebc9232cf432301b75ff98ec80d6f318ef8d5bb441e71c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "86a905cb9c9a194f1118ae4336f96088bdfab410b13a62282bc8ff44b47e0bfc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "db8010a119ff85e3eca0cbeb5f01825aed2fc679e48ebdcccc88eea459f9413b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159312, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "ddffb9534cff004d8e36fa8b30eac49a272e45a0ea0a486c2115fabe63b83efb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "e8a70c0999a13d37881e0ddf89518ab00fe9a90c7103a342d686d0043c70035c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154576, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, 
"2cb05bec2ce33584ba090c3311a83e3c95fc43ec03637d4dd34dc5939c1adeeb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "c5efb4bf753db1bc5958181158a4d71042de6d7fd924adafb9de52d4d28a6495"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "7ad3d08f998272e58b1d7d062eeff0cfdf159446877d0c78b91531c3936173b8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "32ee655aea9a490b12939b1458ee2a44c8761d3f28d4481bd6e58f82066be0e7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "8fe08e0a53392e4f442d434d8017712ab1fbc06e2702316797696592f03c3a29"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "47f57b97b9e828fdaf9121399b48517d841aad99ca05015280e01f75bbdbd6d2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "3c1b268dd3423551e9d89b5722d8db83e6941d4606b2d85fa11c13ae5a96aca3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "8f10a6ea966d528c355d270c1c5ba1625da6c8339043e63edd67a7bfed9b28b7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "2a316f25f9af5123d8255f6f277639558f594d6f5f91451792d5004e9f096854"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "25e8fd268d721fb9b014a8f1dd3ffaeb0c519dbf4edbb0e296754fabd71230f9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "054777aa737d250a5fe0cc484b1ac740b862c60f8b0e6f3fd0bc48d20f9ace1e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "0d44e078c9cc7863b08d7a1e027549c763ed9cd49f4489ffb9c7035ef88060ad"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "a1ee6e9329eb58654e96e2d4c6cd4a04a2d0e33c9711aa5ec4b077183103ab8f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "f328fae48efe8843c3c0bf08342bcdf66cdc9ce296f5f2d36677187926f0fefd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "cd0adf1dcbc29f2e791b551c8ce535dfc5850ca93b161c57c734e1c50a5819cb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "d15df367b932633df141ad56e5cfba8c23b5d4987eae05c192d94a1ed3c09fc5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 200888, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "2fa4cc229531c167e61072ad6e81959dbb678ed104ce83e4b871c49f92292d8c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "58320c87cdb850a66b96eac11e7843a73f148d57f97865dccdccb99f69035304"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "063f4b94b21708b2a27a3af788eace32b2d02737aa46c9bb8595fc4e4e2de1e2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "205242eb3057f00849db2e75898ecf2823f67f8a055910e821ac6d3da087b740"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "4163e936fa85c18767281ccde861d9a758a55b0ae5f503c08545b315a0c57eae"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "74c7c1388a2463db94d86349a9710eadcd0f770b6e70db2aed66a389d483cb53"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, 
"eb62613e96c866dcafdeaf422fbf1811d058c02b1ef38ab71150e13e57465c9b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "91e80bf7f8982747a54ff57780f4559e6bc19dd23fc336e951e78a548e144329"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "4cf6c49f4162a60a744eef144da2d2a53ad15a0523f98d8ccf1980996952d6bc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "088b7213939e3f9e1fd5b4d90ceb2f8fcceede8436bd1cdc68c848c7a6c32322"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 
1, 0, 0, 0, false, false, false, false, false, false, "e930a859884f48aefc725e969fd5089ecff27e54ae45ac0e365304e44d0ce6a4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "776a27a1854f8e2d9c459ae6147d3d29df3b69066ca99e4c533d11340f79295c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 160096, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "c047425462db41aa9814baa4ede9ab54bda78f49960103ecf7f4bfc5554cd368"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "6d33def0d783be5d9b20be54df2a92147777a88ac1eae287fea1c960b77c0f59"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 171360, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "df17c0d35862e7251385005564a0f51ac9f30d1a225a1c7a5653ee9d3421ad25"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "23da366ed711153f492b0292011569be7084f1627563345722e0146708c8b7ee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "e1dd2d31b1206fce2912ea7e9f73e9d3a97bebe20c561ca81b2dcf5ab84de978"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "64f7ef64795ba92a15f58c766d1543c82957e41bc80360a80e9d9bfc13a9a3fd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 154464, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "73b848f358478a5b758c08d6fb1641ab3aa8de723e0f5547a68b5665e496b08a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "2b3ee015b8b9f6ff233fe68808cce2746f5142af4174806dfb8350da095c4dfb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "b7a6dc8f56c52d78a5cc8ed03ea8457510b5cecc530cf21bded305d55df6f786"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "678ef7d123c6b6e5a3247adc8f03daa074bc912832fb2a5c555a5c172c45071f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "5a2446dadecee92fb7f8a698e5f682e7d3dba04ceacb9b0e3ba762371742a4e2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165072, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "b96ded67a32e42e722f34911a4c9c20032f91ddea09f772806ac2d508bc887ff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "6c56bff58b11a2f7c5649e69459bbfde916e447059294ee2c707f23193f16502"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "396762a3b63863e6c52d0fb517b8bc48fb4653c561fca95d208ffc6fcc1f5189"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "6212f8fcd29237fa682f3806b3d23e9509363fda660067021262cc54fc7afc74"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "d8c3ebdb5ccceb6b6e5d6ef170d06bc44ef98d38455e3ee3484e7fd74210965d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181616, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "dbf9efcd142f4b0378e8da2296cdad007cf1cfcdd820888dfb14cac340462536"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "4c392073a6a9ac117c091e3d0d3aa73ff40e6897f5aeaf391a9b450f9207f111"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "eb46746b516dbe56784d4178e4e6a691990ef8b5a653e6ad9994af844d0c5aa2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "dd2ed67a1827f1d5f11167129fec9b68f13ce48a75d3869dd7dd82b494789969"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "5ff8601bae5cca913530889558b1ef1e65c357c91ee09b76a16499c462d60d5d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "9ae7c18e24731b0a1bba4fa5ed35d0eaf5c8c8aa5fd51b7e45265200c855a3a5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "52ae94a6ea0676726ad99ffcb64c758484bad14cfb008fc20c825bca8c2e3733"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "e50d0ac7582ae1977d33af0f135d7f8968b6b69be8a489db41db767274949f9a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "43775f7aa16b652dbfc049bafaed398384216d5ae5061be9caa61dbb1c76ed3c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "cb7a0493e9c8e6f0f6c6bee2eda7feb30144ea5911056939e710987fd41b5224"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 161216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "ad377598ceef3af99cc62944aae00ed842a63948fc47b36160552337fa9021f2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, 
"8f9ebc9b367c6f4bc382450ec2686736c5b861008c68b5b6e2220e260d397015"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 155456, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "9e7787c741819a7a091bff5d1db371fb2dc82653bab59eed1599fdac4e72483c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "4180cf24747a90a2f99182b49e1c1c7728d7ecbf572a7d8124a4d70072f67784"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "d467cf738af0a15d5af3ce8ff565f911c2ed1ef4a50b924d831309f9fa1fa99a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "bd68e585ca590307ffe981fd10d8e2ed887f6fc643f3237a49679e9294aaeb41"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 200888, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "c73b875cb6a8a8b19fca63fe76f71542fd596ce177df69f370444801e5f3740c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "02d5f5c04d005d385cf8b57c6c311295c66674f4c181fa7c0492c7bd06bb2c2b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "41bff44ea9e5682706e27dc0529be16e808747c6ffcfcf902697567df48b8f26"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "00201e5f4c9b68e0c2cc3e4b7beb2d977401e01cc776ec65ac6d9b76cbadb473"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "a8a85e16769d1a4f28879820960100a1870cc0d492cb5ee8dfcda08ea605e5ab"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 2, 3, 0, 1, 
true, true, false, false, false, false, "068a8cd0722d26c97bca4a8a5161ac1d83be369f041b67eba1488192dda98231"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "14912c09fb2b204ff835fa7731fe64e4667c949d4e135a724445f199fd7318a3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "1751a235b7b4634f8409c5ae86fc60ec463bd9ca38c6238145c1ba7606cd5ff7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "fe27a1f2a2c42657a8af339cce5c3b1db3005ef44950d5b95637bfa0e5788655"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "746d0e9a66a9d65e560f2bfd2e1a4b2ba2082e6052a3afafdb10dd5ecf70d940"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "3738b6088a29d22b762b6c0e1609ded4f098bbbaac1d64bb9f7861df89456f2d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "e461c7fcce2f3cdd616b5cc9394956546d9717c93768cf90716c4c0d107cdc4f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "410a838746745c14e0271b1804f39f2a2c8afbee817d0b26966ea0bfd82343e8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "7f8487abd5d5c6645bd4c70eea99c6bf221fb3a924b2daa41cdd2bd9b32da74f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "b8bb5d9b78330e4ad400859111e9499e46ec554bd8885368129e332584d9749b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, 
"f90a06d92e2be0ef3804e11718fddd20bfbcdb6d2b40695559e38bb84f937864"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 160096, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "0635ae8d682b5be8f306ab81ce7eaf1241d51b75dfc3112b847a91c543c4310c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "732476780279dd61666f59e59975e84f96c4b55f4664b5c6288276f920fda8a6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 171360, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "65f36802f48373a2bc169da58655cebc5fa59f765518a5ec7059f5da8a43af39"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "0d78e6e87a2007771f546d1f55cd1c38ea9de8379b1e7cdaf8ce856f3774e21a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "70d0d9e87606959e222ca2d8f7bb0078c2edbc3083ab826b73fa9f92fe5187e7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "4799b19691588f24f3c015246468ac8905c0fbbdf4c0d5cacd2a2f958f79c957"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 154464, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, 
"85cde4d841e534a0385bc6a19eec5459f8fe871cd3928302932e3a48d1025119"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "d3825ae52db39401a316285178d4879c69a4930dc2779547ec41ef99511a32b5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "53d8351641da36463c98e6575b839af89368ed1588872608da561d3b30529619"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "324fe7f7c3ec4f4e34035e0ba266c9e67d1fa0cb16e0d10ddacd76d5b73b4cf9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 161216, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "2efaefc6b7f2840ec1a2c0cf6ef06784ba68e07f71dc373d3271e4198d6efe9a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "354a9bcafc2d2d75576aebd3edee36098c0d4d617a8e457ec47ef9a4174a213f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 155456, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "86098eb2ead9d7dcbdbdfed23e0321d295baa9fcccbb2b26b48156b4d8b1efd5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "d8d765f1f2c64f14c74dc8f34281989061c08fa993e40f2b2123b092a5b69124"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "b6f9dfda0165e70cafa3ac4aea4b0f5208ee83df3d7652d4ccdc0dea09aa0fe2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "644ee49e168310a2ecbffe865e1b0cedf4c932974de9c963aad284d45f89bdb1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "f47d29d4bb65c9c727f6ae0e55557142ab07574991a47b4762ac64be2707c678"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "629983bcaa30fc561a2312daa4dcf68bb46deb41951bffac38646ebb91cae676"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "2ede7788074fa03b725d0f9755180de963852c2f887f3c3837fe2d3e145d8cd3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "dc980aeb7d291e18e27a1570f959c47c7d31ee467b6de38b997f45815bcd4143"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "bde91721f1c4e8fc4d8f8e7fe2830485d572e5ff2360bc9df69d2edce1690691"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "4f4fa447dc92722741652124ccce4de0a58335ef7a63d543f24face91321f5ae"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "67496863e95e398d2669e7c9ff5002ea0af867a71693e68f5cd87b70a099f797"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "516e345c5442923664011b2c19cfffff60a4dd254a4a975ef4dd1c77913b27a9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 2, 0, 1, 0, false, 
false, false, false, false, true, "460ca7574d2fcd87f75fae1b35333c8f3a08ecf4b57477f3ca6f517e35f5911c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "335d2ce1b66a4adfa5cbd80b88a7eda05da3028f1785f2f25486ffc31270978b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 214208, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "3d7c47fdf858821d33494f623124733a10367363d6534040a2f41644eccd7cb3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "db247391535be62b9e2cf4769f18bb23b9e26d25fafc5d621d51064e911c7046"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 208568, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "0727976c1779978ac7a92ff0fb134488300c4751f17e2a4ebacb92093053f21e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 181440, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "00e16eb7750dcc16b9eef25e639bfc554859a297ea127d2c5887e1fc35fc227b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "0cb752d4a3223be72b32bddc55c1f022eab20b2bc0bc3a94002b6fb0ad83cb05"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "824c770561374b5ec886fa0e0539c6e4fd53246a77f8bcca10bd8177b756034e"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "739bb6a0322497f0b3d1ad8c44a4ccdf1ee12c55f9d554b12348c7969ff0455b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "2428e325e6f5123d0d144669f28ba81c05a306b15b6e04cc01a8b7c224c8584c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 181424, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "dc50799ffb22027f0f4b9ecea6170525c06e7fdf42de7a5b6ceb9476e431092a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 
2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "3141040a04a8fcc07dfbf2fa15742f4a42e48f07f6bf17d2032fcd9a860f5c1c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "9673754756bba0bdca5915fac9916709f47472542290eb8e9bad3a5af48d25da"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 214352, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "d738094f68d868e2895903c0a2a7a821f2a3f5816b0765be3a5f15923fa035fd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "dc48d107d1beb85b3a0528578f09e3df982c67b3db1453273080918614b7018b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "e072755b6800ff69a723fbe56dee8dee7d984baf5475d2959bd25ce016599af5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164192, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "b4f9dc5d4cf6ad7c48c506f07789f0fc86815519634eba72f637c509eabdcbdc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "6e03d02b2207290a04135541e57fb9295f46d7f8e0b47373e1f9880cff75c28b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 179552, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "8ad2ced65d2206dd986c368fe8d38e729ab06853f662c8bee8ab1f945ab7cf12"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "e5691fc3e80eb66c97eda91dfb2ffa0b3ce16764f840255187b4f2ca5fd41b49"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 181584, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "e0b15ee273e935f555dbd95d4c343ab9067cbcbeb530bd7c5f4f715ac6f04cf8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 181408, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "90da0886efd4423db40703ce744b305c7ff9f61d0622ae8e7bd3b49559874fb7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156512, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "e669e38c13c9df1f0bb68ab00a8c9a8559a68fe359efd918f3891769ae7d7c26"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "efa63b6e96af05ad05c273e21c4b6ea3cd7a2bdf252b7ba6c595e3685106992b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "b27b6a6d90d8188b7e7667b3b8c2f92b764a4320d0b4a9351d293bd64d17ccc9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "466ce09145bc8572a7c6b014ac8e04c48dfdd73f41fba9ad75b5605a7f3c52d5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 196376, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, 
"1fcd5f2f3037f78b1c5ca4ca59ba982d8400f74a1365c1df32e960849c4c28d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 189592, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "b99040bfdf8c463a465fcb9d5599d4b9fa31ea226fc1c23a7f1481e8f4690e47"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "d334e988d1c3821da05728c8c10d8cf451eb1ecbadbbbad09c1dbebc6d5f8295"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "56c9b508347114a8c72bc24d4f59af2c56282516654120714ec89a973288fa3e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "e6b1def62c83c6ce74ebd742e29c29fa434d83d851f5aaf5aa23515c9c596308"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "6f83610d9f0f0183cbc53dae759280134ae1a7a87c33362f5065fa11479b7dcf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "6592194b3d3757b87c465007ab28d9c6e1a26cc170c5f6c3b976ca7077b50bcc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "7c8bb50f9131fee3374979404cb7ea177cd88713ad475de4dafd747de9ef2dc0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 165312, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "e3c3d2b94a92ed0f03982dedcfda977fd2c6ca3398fa2b1431930faa019ac4b2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "531318ef86dcfb45acc43244c0ea567be971df6a1eef148eee761b43f609aa1e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 157504, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "5eb14133513e270200c8262736095664df024c1d9ab31bc39a8a7c55b35e92da"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, 
true, "bcb4e4e8033ffe3021708c0929662d3cd4182b4eb2cc0ff13bc2d072e6a5dbc4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 214208, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "dc1495d5e0d2fa7015619a63af2f0e78fad5b6fed9458ef1396c79a48231607b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "bfe4276f6a42c5fb310d8d58e428b041f79fbf214919d9a055629d731d4e1327"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 208568, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "b0185c7ba2d08f8ee7abf1be34b8ab24274d84873d79dfd9b5146ebacc0205b6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 181440, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "79b05be43919b4a180b69c19573d1ab3f01cc851410cebd99821b1ec6571bfbd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "bb815a614ba7b2b7cba280fbd04efa849745c4813f55c57f058fbfec1ab4dbee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 196376, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "cd4d5ff631aefeacae4c28217262b06b1a9d66e02a57a55a42d1bae6dcc34fb9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 189592, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "d13cf88d391bb3e799c9ef9e8cbd5fdb8302c50279e3b441559e639091e776ee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "6582f0553b456b8e0d60c3fccc2aa3ac2a254ac77c3dcaa00326fd24d0bcdb8c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "e300bd3163935c3da490f218cfa51f0e7df03ca754c574d506085623ff8cb8cf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "d8b81d09d54c7c7d731a6f1621ac4410b0f5ea0da6067ad1c1c85250cfd5742e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 181424, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "bf0d6c99a2c780297cb4d7b3678c56d918c2210cc23807358532f0f4c6e3f54e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "7a06610a50db97c3082f12c571bf43bab52f8482bdaa0e8990dcc063d6ea1da7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, 
"45959bcad770bba16aa77d661c587cca1cc09727a32ca3364c1c2a100cf8c5ed"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "1e0917d740dc1cb6eada1d83951e5c72bc87c011fb7e72cb83863486d4a80669"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "fd34feb1c9daa648ab940ddb8645ea9567452e93dd60cc57a89275ea6ad84d97"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 214352, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "2d2d59737ee8d4e08b60044ba2164765ee576149a90c2e8e8d8ad0e721aba8d3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "b7414d53c195866eda3ee9f0448c869fb6fba750740c75c1dbabfa907bef50e3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "f178d678cefbd55b7a053397968329f25b733bc7de7b43153c5b0e36b564f91d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164192, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "f467cb7a0ebea64bce09f97e0abfddd74d2cd3c2fae3dc3f86ee9a47f56891b1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 
32, 2, 2, 0, 0, true, true, false, false, false, false, "813d4d5b7075679362ca49c76524e3f21fd3c2b7e80f5a532a08e251cb4c9a74"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 179552, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "355e33cdbcdc512d16e9c2b945f994816b022ab8abd4d62c9beac356fed1c5ca"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "11b4318ff70bc97f60d209b439346d4fc9364271bd68a03f87827a2026c25f2b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 181584, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "47e67763a6e8a5b6f6d9913679ec670caa875013398a030c8acbb7c127fa0f04"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 181408, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "b4970a634bb08aef91c68fecdc98b192bd9ee72d0661e80026c950bf2c68aeec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156512, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "effb4be626f6694c6bf9a226dd4e34d3a0df2f0f36384b3b9462af8afbaf4d31"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "634eaa52b12741dd64ea3e0fc0ab39bf01b6d07fff3cf7f0c9e2f1871c3bc8f1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "797b16c80188148c770e3f6c0eb22d944ea956a2f9fb144bf2d0bb209cbb645e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "57a2e1604d0a7fb1713b84058962a7f6a1f210e3a3de1ed927c1df54c72632f3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 165312, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "b64a933566977cad3d9965d8329ca0a2f8a15d1777e071179c5f59e70c89a593"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, 
"3330a72a45771d01e76bda7d0c8b879b6bb9aba75fe98dcc9b5dcd10ddb9ec01"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 157504, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "e98a047387bde816da5d7d9d5078c7affb90022144e22b702881f869b43dae3d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "2d575ebeb1244f710cba26dd35c1bd1868633980aeb8acfe7ab559c3a3cd8c19"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "52764e1c4cb90c6551a184aab128885f2ee901caf897d8fc08a953df27d08b7f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "bed62eb53fc324576e9a7ed3a765182aa9f5299c612c0abd9b884162e2459bcc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "ea22662ad26ecf04a50aae6e220e9cbcb4abb2b09d956937e00f6d60e6d18f09"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "c6502b1665f6dd34ad2531b763a4e12bea964866d89c81370b4be59d8ebf6ad2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "115988e6ce59dd321017e798efe340ed6e01b0803ba3af4c79b445a4803a0ff8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "16dca3f227ef36970342d197241050fb9b8050dcc3fc46c32ede5acef3a310f4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "77199e994b16fc799cd5d932d9176fccf76c2fd816c7796491b6fb700d509d1d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "0508aa6757d4c0cb3bd7705b4fe3df935aa8f7032f3b8b52add71533584d9636"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "5d701c1f05302541d6172295eccfc93a2f41f609607acf34a49e58580109c693"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "a9fbab3363b38990f7b4f6b9b8eb748ad4106b68974cf1eff31cf12629b174a1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "2c3096db93cbf1e9c56ee9d0f13c65a7985c6c9d8b4916a470dae582246a2814"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "aa8f3ebdca3c52976ddfc14e8965afe00936f6168c96e79334da796cd4bf0514"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "58b8ce42254aab47212b5a30e1b9b538dd4444f6c07fa8f8df80ed2baaabbc4c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "c8a80b672c43aa599f0edb13f025322d2029b202ad2cb7a64d416fa570a050fd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 197960, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "3c9015526ed48e4f0ee79c1364b836fa599a0e9aca8116e1c79db516cd2ef446"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 152912, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "726fe2c8a69f82d4c8eb110c7e9f71d0c0247079aa6acecfc0ccd1aa196022b2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "5381b89d410a15d1cfe727f79435e240b4aa4cc603956d2fe1f40744628119a9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "a2306b71f4547e1550cb8a1990b5f03c42eda16421d62ea554c0ff6686ea7a14"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "16392fd025a40d3cdf83dd4a7b75951e853a47a68a087fdf59e4bcb4c6e7f96d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, 
"86829c72d7b4783b939fa6854688788a700aba23655a4dba9d36967ae89622ea"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 152896, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "6848756cb2fd4b430ee327996ec1ee83a6e0099f89ee0164a664877e0b6dcdb5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "f79d53b298ac2b4631a7448e07fcbe4bc393dbe568beb2ca4aece96efe0d91c7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "8a934c3666eef4f61e0a75b44703c9eec1778135bcea10d20c4cdbfe1f26c7c7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "5f6c692f97d439c4664cd49f2d8a7a928164e7385fb2dbcfedc4e6fcf76a4b9f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "b0eb470c9d9f890b6c308215052ab7fd94ed560b52a65c81b874dc1b7b084806"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "060e3165a487d8d51fe5125fbfb4a4da374362c4d35d811156fe0876a077fa87"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 157168, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "63e7ae4dc3f3e13cc28ce53de3d1f47e2619b7bc2aff5c3ad1fe098d536e22c0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "93254df545c23eb6cde50b135bf8d1c9fffe315bedd49ee5263073ed23a4c4c1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 165360, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "d0a3300ae6cf2981016978bd07e60485d5353bf4acbfc27f549283ff1024a4f5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "11c1cd490df31f3d749774fea7570dc8c64ea72056e8cb898a3dcd989239d324"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 157168, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "21361ba684f11ec51b721b3671514634cff74141ca074be996a0fd12b7a04be2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 152880, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "8471dd6e1c36f95e3a1405e783e780b7fe8a72d6e9d6cc51a533f2e0fc8cc733"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153072, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "033269134bcfd42358f3ccc299266c2eac9a60f9079a73de2c67030eda58d04e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "ff857566485ba17b0ddec2862c4bac812c40ce28cd7bef9be56af9e4cc5ca153"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "29d02c059b32afeae1a8b85a5654b3de75545886a803e26c82c12733ee7b5e57"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 
128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "43bb95265b9c6a7227dc90917dbe164cf42ad4d5af76d85915b0f63fe3255269"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "20c439048dbfb8ccaa25f9c3ba530d51d6b2b3fc3bcc0ade0872614757c2bdfe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157024, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "e5d2778ccca71a75f90214465862c4b1a3d7371f871fc4748f90027bdecd9e39"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 
512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "6ef6d3cada7d81cbf9bf3f96bec0231b8e630d9ea4fff3f80a16adc577fdcafc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "077a40d07c573b1ca0cfaa50ad3a3707d69a40f43d4eb85f32f88c9e84e31014"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "66774732ab7e7f08520dc47f5e1449a28b08b03508a6f710915aba970ff6405d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "350863ec6256840fbea35fee0f2177b03e7c4cbfa95ec4da65de0e24af8a13c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 165376, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "39139f6a4c9f170ba153babc59a67ba32dd98b7e0289e154be49ef734c1f869b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "820d337664894ca143dc3ea9d82705b907dbc021996c8c93b5669541bd031efe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "cea6247b2e8b6cb827658c4076351ce4ee8e63d93af26c71129f0cf0260a6c8f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, 
"9d1ddffe8c9429934731b60dd17608e55b4b1987ef4f90effd972330b74b98ca"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "01472c426f30c0e258180a35ee2d3391511c927ed20219c0bede1d851b10721b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "a8993824bd384813941080ee1137fc386853d8ed70071a3d1f843c74e77cf657"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "1506cc361160e7f7999700bf099698acef4afc00ef312e16bd52ab531900d599"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "d31a8ebece5aa6262c873bfe26c45101b1a75e2a1ca0fff6dc7861ae2a9870f7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "2eb981e5daa4aaa9ff7a3cd4ad0a4faaf3a7634e14e5b39198da0130b25b5738"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "01bd27dacef6ea8641281760959ae777d8b5f8c6d5f99330388ec0bb0560542d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 158288, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "8c05e9591a0f493de95c672bb722d84ea002550ac202fa1dea9333ca01096e5e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "8d57507b8ca88a9c51248967ae472bbafbd20b51ccd5f86ae16614e6978ab0cd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154064, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "f5a04476aafe5c1f6c22f0fc8f3841270ee844aad1aba559bdda925f3b293b80"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "34042b697175d4b9ec939dec39c17878c1c1fa68f84c0e53d0ef0dc53446e087"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 2, 3, 0, 3, true, 
true, false, false, false, false, "3c082714e0ebdf13c15f7a9ef8daa174449c45849bf20a33d817b101e4a25546"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "f52460e2294cd05b86a5ca435fe9e4e2b0bfbc29021bea40f9260c30cb4c5ae3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 197960, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "4fdac7373e3a19ecede4609502dbc6f4be59c9e2ef6fbbbb6e457a789aae95e8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 152912, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "a627b2d6801e0f052d47efa82d71482da565de8cfac92d3993bbad582c6c4351"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "68beafcd0607ab2415c6559165deecad4ffb7e2ec75e47a424adb3ab3f54fb56"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "7f99a6852e32352b1b6969c782cef5d64e800529ec0feb986722c0c3cf0bf46b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "116f2b2ba457a68e0b6ccb89719c86c885a68be1416d7940a2fec09483f5a236"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "60d66ecaae105d3d689e23112ccc3b66ee788b7a935fe43a29111981c129f769"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "47e71041f91554173f1a2ee75869d04d785e8c805880cbaa5f6929bc4796b461"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "6629865ffd2146a01581f6703f221657bed964c7274cac9e5fc5df6efaad1194"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 152896, 512, 2, 32, 2, 3, 
0, 1, true, true, false, false, false, false, "97f0347d3f6cf0881140686e0507fc2aa747e14bbf4e909f657b9eb7b9219917"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "bc6dc8e6f3091d29d0bac267066bf44df8dcf3e8dee9c21c92a48d02604b450b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "dc49b2feb99eefcdcdf45b957883a5d833cc257ee54bb2a2ab6c60761eb9bbdf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "909f46d0467a98662fbaa6ab899f7433d1edc602eaf99d0edfeab11206783b9a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "a35e2e5abeecda300108632b62a01da813e0d5045027024545648f4c6cc5d83b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "a6ae5c717d3999e01419423fd53aae33292e86c7297e39a7cdd109ddd0877bd0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "aab3c8aae2af705b882704b8d3fed1702f6621855701608216f03eb471335437"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 2, 3, 
0, 0, true, true, false, false, false, false, "c4d744456e9468ba96c10bb902496b20499387ee4f0e099437abb6e192a3f16e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 157168, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "e45da99f530088e18bdf80867ef49ea7180f9c4330d35006adc8719984808564"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "34a083da990dab5b9645792586226501fe1fa88a06387385b02158094f5ac61a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 165360, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "04e061cf1e1683ea410d17b927435112b7a5de2e2e6a29b4260bb588211d81f4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "f34a019029fae40e140b026b83611f2734163a1998a960a9a39ecc1f07dd5af5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 157168, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "9b8063e29abf3adfa4a7f38f50614ee439eea0b0c3837b430a054e8f17e1294a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 152880, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "835e74adb78f489f2ac41053a01f2b47f5e23398d4bdf9f441cf60303939f256"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153072, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, 
"cc45347c891d3618db570baabdf7c88dea2f425db1436da723a33621ee243490"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "eeb163af167663f6b2fbbaa8cc963c525b03ae4b293fdb196c76aabf48842e5e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "7647b3ca4eed54abec017ce56d7b55dc0f3c335000cc3d282bbe7f29836247b4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "cd9816a2f704ac16ef863c8ecaaed3bd6520286b0c433d69dfdb91817dd593fb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 158288, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "a60299aff958c6ce7fcbaf04273443290c65c8e0b28c86dab5c3c53063bbffcd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "9f1991aa140ab34312ba7403ea4c8c9bd8ea9376bd57f5db0b79298231daa252"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154064, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "611b9aa50b2e528df9af831d464ce2b977d106636278a04441c429394ac90beb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 
153376, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "d5cb00a1ce6a1da7a2e6204fc7d60a9a5ca7e160881313799223f5eac82672b9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "2c2a32ca5d6527cc95e1a64fb9ac95af06d6ac230cf05fa5eff8a343846b34aa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "bc5354b7883370beb668389c815b54bbe0691cf6e300e3f953eb97ba4be426aa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "e3a8eed426a1738046b78762535a61adb35e70332f69056bac3d2601efb23e5a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "4ffca8813e64203f0a946c47bc355abbba2e42d83c7223d957b43361e54cc7f3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "c95c99cf26ae42b2470e0fcdd66a95afb7814a74618aaa7a1a57aca854e08f5c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "2d5d97402d2780d9a46ba990e299fb49f2568ec0d658eb49f3e40c0dca4d59e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "6d07f788c649aa81475ebe9a84a6f90e075aaef0e9d252bd1b671c6be44abe9b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "233528a64df9233e16b654260c5351f3c3b5144ff433bdb72bdae3eb8d9e0507"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "774ab33b6d55d407b84561c25eafcbea8fae7e52f2c03f2cefe41a29953da587"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "7c3479e91c2ee8bcb081ce6fd781bbf45d89e5db0d67dd18ae2dfdfab77e387d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "8077b49c223d003be763ef48c80a5c27fc019f7b78949f836b9b3c0662c16f0a"}, 
-{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "9e35131ff77ec7b7dffb0020dc63bbd7ea031e55659b53dfd4623be0e3ce57d6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "88b6902ec38df4d16f1d4c6236ab2bc5b0678f7433a4dfaed9738f765dbb4fb9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "bdb9d0bd7a296abbc6256bd4e4af84f874c28f109209a85393ab69800c765605"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 200888, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "769cfbe444f685472df31df88e8ac2d457f38f3c1d1ae673293f7cfc17f8324b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "fb4303a4bf039c8ab3a7b3ef2691ecbd7851a0420821d79bf61dff7b7054d841"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "09187bcacc9137cd456ed38abad93248315a76e223de65181e56c2659650799f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "ceb569f099001d05f7ffc53be3c123e5d02c385f74e0213e86d0cf6118eaee55"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "b3d81126b96978086815a50a1ab3fdc3f35d5ea04d129e09629f07f7586673c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "b71db8698dff5973b950973e68b8b279e69c887ad22b26d3f72582ee2844c320"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "7d6cdc1712f2413f16559826a60a837c541e82659a36a7d5360b5b589585cd49"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, 
"d0145d91b47d843a0eaaaec9204057106f2702bdd69dba1075211fabacc604f6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "c9306a00f275c3f29dd571b8bf5bf79030a35a19d02eeb0af87f8935a1130470"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "f17b771b76bad89a4588d2bf6c1cadd72f56acfda5adba6bda1043857baa8c93"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "948ce4eec0ed1406694e48a20a8d54ea24740dca9038a3a116607282ff740582"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 1, 3, 0, 0, true, 
true, false, false, false, false, "6f9af45af5297b7bc2b6cfa6aa13b48b7c621d01acca07369c7e104829f23eb8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162144, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "f8bf6f4ef393b418ef24cb7960394a9344ca5ca58267f6d55e58bf7ff974d4bd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "630e7c4775363beff3cc2c2d2bb31c1c90f4d2b74759c9e986f6d76019c6af97"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175456, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "4eb9d8fca93f82c9e1aa77ac487dd15180eeb77704fe159654e9ae7ebedf1929"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "e0383a409da4154649c48b472912a59aaf7a91955605868549dfbf616033ce4e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "41cc435c0c23ae200a1e59d508ecdff69fba059f5e65e09eb7d4c8a2a9989ae9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "c1db7968d45a97bf7d1284463dc44856f06b60a7797f07b6568292483d7ef8c2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155488, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "4da5775c00e026e1ea9695bea161c3eb9533ce535ccec94de9ec46c3acc5683e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "dd64720a6b9c317b58114e12f07d89325a381e200a65f4b1982ea7c94fd761cc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "496c46033715ab17a5e70831c9bc3069bea55a48c16819ddb4e2c5517af183d7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "c05a5402e344ddedf942d0b78fec5f1186658394318d78f9b98d85dfe47d651d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "5153f8bf9202bfe07b82744ec5989b29f0710517adaf6b7e4d8852097667df0d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 
128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165072, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "00bd2359e7cdc11b760959ca5929ef9fc4931a355c46ba0acde3f5c7123a9da7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "c07cfbd40db528506dc2c36fa3aa8dbe6605be0792c7efbc23f5eb5c4de21c62"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "2017b7672a733f86be0ab0092fc5128ada59803f28747665b9d366a495d78656"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "a6190c66005b70a550251feb45dbea5f5e146233076876376e45c2a2f0f1c70c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "f1d1575ac49d8ec38554f36f9a47130352e272e533085f94561a9e49c5f7a59c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181616, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "d5ba5b18cde7f8409f0f600ce0e01c85cb5b5e4e497259e42848abbddd509042"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "ad0276147d90a6627dc7cb97f6893c1e9b115a7b4367807d3ec6b308f259b255"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "c8a67da9e6f2f750434ebd8e72865b5bb2472cb4957fcf4269fda1bcd567a92a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "f9b55ea9dd89943ca3003b3f323c472624beb65493bae6b450af1806e7af401c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "bdf54a02153bf06f551671094d63877bdf43731435e4ea6a02e60bacb62fc62b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "a46d7f4c7433eb1244596234e36babf40ff24b86db3a4d46936f01d594cddee7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "be54d95e8b0a6f04589c05d523d414171aade0e85330d5f2b87bd50346476a5a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "82efd043e1a2501c19ecb10d62f8867e3e8f0275afa21e9d808955f101d4b0ce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "18012bf245007a146dfb12ca80233e4729031b554c0c5516b2b756d1e509fd41"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "dda5a368730cf78c1d9d3ce1d0d2c43dec0e2091d9624712a8e89bec908e9a6a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163264, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "c3b7034ea6300f0d910d50e1cffc54d5e137df1216ff227230741ddad0cd97ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "eb12c97a7e0dca1182f3497ebf0d3bad990dd1dfc54c5c24ecbf206c0e09bed5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156480, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "9db5015e9a97a28a4553349ec220b5593e0018d1b018e2cdc87f7797b2bbc0de"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "3ce9dc4c5ab1156c5558be8391f8258be89b68bbf4a7636e31c29be090695ca2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "ffba7ba81667a7c72684f101acdf8f1f73b5cab8e013ba1ca598c98bc2240a23"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "6f9662bff6c3c89b7d812e6999a2722cf9d75b9cab242ead8331b57f7d8b18a9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 200888, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "de887712625bfadfdc0d8dab7df9d18d398740bdd2a51b8e74c02daa90db5cce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "6f10ed07707136429ab7630892180cecff026972e3d53bfce07992b017bef956"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "fc69679d6263c27b364f1827e32232270df1b3921abf98334ad8083b0a65088b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "c2b4900a073ad03fdef10e455bf0124162447e054de3758a97cfe6520474a0bf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "9629caab45ece43378eb784762779043f129d5250416a1195d27eb0ccd1f702d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "ba03e7a89695f93025a4fdcaa2e3e587868a43d34b73beafcc95269dd7ff4c3b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, 
false, "92f3f823815b343552ebe2d14e1f9232d7f0f1910fb0b0530be2b506455fcc4f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "0efd49e3e1349d20e6b870de03c71b83e12f071ae115340a1e93e5881693414e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "fe7ec5b2f993d9a9b98411558af0305c0545c581edd4a1fedc4cbe0770983689"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "86d5179182e28bbde0902f949aedc39a6239b8e53d03d5d99eade090e0d1fce3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "0c4ac6ceb4c1ca04b3daa27d972fe064d2523f41d55c109f0b5322b846b208b3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "0c082f754221a7123bb273398e210ab96262f4f9f163e30b2ea26c23291a1521"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "2d5f3992a95a30e4e3a6aabc7fcfd549e2bf84acc739669a598d3a70b05e4f01"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "2c13c1a210436faac413cef1cb550d0b9e4dbf34b8fee5de3ea2bda7f732e0cd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "e373dcb993812177f75ba8fe331dd5ff01586b3bfd4e40ba9176601404c297eb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "76cc4be37b03d585fc4d461b439751fb28dbfe3e36c6486605338dce9f8d7050"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162144, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, 
"d0409a216ebf0a9f8f60f3c017360f48e614f97a2ecffd7eae2f939cb58afbf3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "2e92359ac595c3611900c36243600e26ba255395e75cc7b43b8037d129bb3413"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175456, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "c6cccaa4390c435c141cf314a39f29460ed8171433f58fcfafc1f328195d6ecc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "dc3c084dbcc20734c699227f73acab674975ec20c2b7096feb9cf6d38a549e16"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "ff401635139627518890c5d1d5c9de470e6bdc33fae617086d2cb88df3b99ccf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "16fe7dc6da9c70abcd8a9da436531cc219c8166b030dd28f156538ff6beaa2a7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155488, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "4762cc92958cd90515d2a4ae75eb7d6cebe8abadf83bcab89a3ec1b454ec1680"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, 
"24292e89017de8eeb95d8ce5885fb455302ccfd65fc8748aa865b69838c840be"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "d183898aa95590dbd7b6b66ee6105c22d5c7558ceeb20207b2479839d69fe97f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "40a0739535cf371672dedd9d13bd3115e8d2c372eecc60bf52fda9b6541a3d34"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163264, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "4c5d20f8388a229561e4edf9011b802fc47101e6f1ad0913248fbd272e7fc2a8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "b844f73ff7950f0b19af176be5574c2cc3f7c650647d64e7baf15bd042c12121"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156480, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "ef9a909636bcf0471188d61d4c66f2da06954d5fc35e72def771b53072d5723d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "f8fc4ef6ba91be434cf2bfad6f2a6231e206c8d990b6e3fee28a656cf3130af5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "8c350f0fe15ac87f4403713872ffdd76d526f9e5e0efbd100efda4292bf04d68"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "8850d9f0007052c1a06542d66e4e52e373634dfe4fdcb6a40ea2713e8e03d1de"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "08f65ba0c4be17aad31b579e9ba6aa27344a74ff209a4c78a20ed94c16b18f82"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "d5e741ffd67ee44bdd5d41f68ad6144fc1ead128ecbc9961a386d7f212e3dce3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "e74b2cfb07ab028854f23799c70f5e1b57d9f1e58ce905f90cc6edcc6a363ca7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "861bfb2694e2d2f8d49e104e63f33e22d324d8c212f5d5989cb1e719516679d6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "80d3f39aedf8bd391724e2ce468fb90415b7e831f4fc38230c84d66305d9936e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "dbdb2c12e64e8ec0ab1d20baee887aef22ee652211704d1f6be8c88d8d7ce3ab"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "622b7d9052ca7ec2229b53f68a92c912eaa20ec2d2286ed6c3a0504f78c53b6b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "fc34dd453a19bb8078a54684342737d9c93538676a7ccd60b390631a4a4f80c5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "61192d7aa7669cc8e89607620e452cb9606717e57a58e2f82bdecae9d9d1244c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "70c7d1062c93f692662c76780fc6ff67eb6dbd81d90c246528ab9db9e5780b43"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 214208, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "fe7b8a3e82e0cad42973bb820c29c1925e9eb94d360d0075f293bcf10d57a1bd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "2c7757fd4923e46c74bb3a9f7a8c5b964113e192ebd2928d5428c0016404d370"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 208568, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "5923bc91df8bb9f7fe2eb1a34b1c311cf769c8f74987b9b1252639b6b2f75010"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 181440, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "a98556e2bf7b2fe95a4503f13303f0192fcea6404348af2b84dd52baea7c35b3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "cc360bd4936a1b68c1dc9de4af82999393856dd5269f305ded1d6d89329cb8e9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "5cbe9c01564ba1dd08b4d089dc9cfa53cf6649d88cf7eaa342a967f09ff0603b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, 
false, "b6db979a7f144d6677ad0ea302b68a17d0c97ca416dd8dcada3260d4f0d7e829"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "d90cddc6f11ca7780c88138db7dcc938401da05d7ed5e328c9808cd16a1f19f6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 181424, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "a5cbf89da7eb09e33be868bda1c0ff9da654fc3238e0df77ad261441442608fc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "62bd9022ab02839f66b0e9932a849f89ffee01e1a02d525b45a2f6429cd6a59b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "89b8895b9a5ffd18cd980d990dfcef8d80b9635889c580c55421a049b1f32868"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 214352, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "0ea78e7622c14d9b92473d02605c85495b4b1db427b26e046715788d764bb1d7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "60de4caf1849848c97945f79d8a87c8ec72a1c94f8b500824b4e7b3e016568f7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "a4a7a399e10ae4bc41ecd9c181c3c9cedb1e40c06e4e6fd2ad21a560067bf3e1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166240, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "de7176f27dc192ad98a0423305d263fe8c39074fc3f99827b0ae9434cd99d45f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "fb40c4b63d957ca47af3fcb46bfba05d6c19fbcad8b273d5cab494613e4b1a9b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 183648, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "ab9f7677ae0210770c2421577454f78f2797a8aaabc999f6ebfa6362fb55e821"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "6c5b0ee60741c19e099f6a701e0e4d5449591acb93144127f0e975729ce7f1ce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 181584, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "a827c4d1636c57cdff15ee47a00186b9643a77f0595b54e67a2facc37b657da6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 181408, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "ce6807c0c12cbf4a466cfecba64626b1e4df58c546e8d8a754e8f33d3141af1f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157536, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "5163eb71c17ef82a6ce283332496e62927076402ecd7b2cdd982a4d7771bb225"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "543ae1dbec7d7d6960c78c87869fcdff64dbef972e495442044b44412feef11d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 
128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "0ff1278582016d9a873b0a99a4147beaeee24fce0d3c0915c97de7ba75a684ed"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "d6ee5f521e91ac981411f0a8c82b39ec1ce6deae765d38cfff53a34233227f0b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 196376, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "688e2facdeff2a0731bc0a901b632967b6700c8743d7af5ec309c303176a4cfc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 189592, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "21e5e8da77588ea61ed849a60688da5e47e56992f1ef0f1b8b2e4e0689059dd4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "b89c2ccae9fbb7fb691a8047ff39198dd8624237a404e7f746c07be15c02a843"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "b3506f0629ab6d91f901ae7fcebf2e4457ddb2121ca238e35720e48dff15eede"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "d578cd932b4f5c918e9438136326f713f7eb505ed6934e10079fc38a0994f400"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "c59e692ff0c37d82bffe97201dfbbe53cdd9286ae57713aae55a879a2e8cad8f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "245fd8b0a2fdc9c4fb86b220658c91329762a2bf5d1fcd38284793b58494182e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "bd7d5c408c91f7489155ceae300173a44de67a6d5f97b2b7db0e3ae4ab8f4dfc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 167360, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, 
"e1eddcc209d1fd84c36fa5942098c7d2a4aae7cba08f1e22ead368993d223850"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "b7d87bc66cf9c2c59b2e1c992461f0b6d681e414310e5759dd156c08308603b0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 158528, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "f5d4df98270f585401c660c7934595387d88c5ded829ac4b710ae6a731021af6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "ca465d4ffad7775ec1ba7621b2d118b7213ce0209649ec3db21c4f4593df3303"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 214208, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "4f5511c5a8d73c85d56ef689df3ce482fde9177ba90d8f05b9ef7138118609f7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "a4b066bfc48858751867560f22716a99b99bc27e6b91fa9962fa5078bd9da33a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 208568, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "8f1cf0397460021de02cca531a1942d4dd67cf2d8c4f62359aeff7b5436518e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 181440, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "7d20d146c0ee6e9e115607cdd755956ee33d61d83e80b2e0f4c8c50cdf87f787"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "b803ed713c936aed37d5f28968878f4cb264120639d2002fe3e3039e4fac13a5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 196376, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "89ebdd18aae7bb41fed86b564c4b23fb6229cef8cf953fc8b3e92f46131a5807"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 189592, 384, 2, 32, 2, 2, 0, 3, 
true, false, false, false, false, true, "e0a8c0e8ff7990440b94f65e9035d983d90ead055a23e73c026e6b1895b10fec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "eeacbdd0587a935696f3879ee5769d3cb4818a7f463c2e9750d42d8a2f3d1d45"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "455d2421945986782c47c1975db1540ef99393111f0e08a727278ae94b315f5b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "7e0e5b0e512f45aded9c73f13d006b0c309a966907a7fc4580da838d92f3764f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 181424, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "103d23e045712847b10b027a16aff50d4a80b5ddf0ab4a9f59a0604bf6260e33"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "29b1a265a847b8cd62677e11c882e8b7a746fc5472345ca3f3642b470c2cbe69"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "13ce1cf79dfd7ef4254d0c24a382fdad4806bca5ab8f0879ed26c62d1cb4b670"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "e7a0c8492eda3d4919ee4adde8ce26305608f37b4bd2ebeb72f0fe7f38b317a5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "e22e9443b2fbab423d868036a98ede15c92a3e7731f492fcce25ddf3a571122a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 214352, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "6f7ac0f1fc09a214236dfb9ac5db888fa1a7e2b5bfdf694a91f0c588f652af34"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, 
"3676cf5f52775eb96210628bc6d34cbb1958933ed0276633dea100239e7ce35d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "677cc2865a2da0df60abe8b2dcfed9f5e35656f155b30d609f44a1e5d78c43df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166240, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "8cefcfe7aaa5db366bc4f4589a5922a9255ecb71ed615e586f62ef36ec429102"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "64d716c3c36021a266fd9f7b41f910cc9ac07b8ed76b9fe5629deb41cc9bc856"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 183648, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "820b3c24d8ecdd074f31a38f6994ae083db623b0ec47bcf43bc0539c7cc50608"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "305ed8c9328be3e3ff6a09cf648abdeec910958d9368c70a9ed9b26eae54b142"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 181584, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "75b69a1bf5354ca7b66c88e8f0d044f2bcfb39f5c8cdf73c602e0ad54b61f6a9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 181408, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, 
"bee3ce035751b8da0d6d26d9ec3dc6203981de12cbf1d04fb047338311d82f64"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157536, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "76a938422b978b1340447f362753214169cc68bf6e666516dc9a3b56395306da"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "d39a6e7467c4821baf7dbab3cc584301c1cd232816b497257143f3fefa3e4d9e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "ca57ef16322355f777d39f6729512b950533611b488577251dcb2707edbf2f8d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "adbabaf2c525a3efcbb18206b2309e14937000bf0d65de0c88f2745770656e34"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 167360, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "3fffd7015dbb1024c30695ebdab4280f0ec0ad914867615d97b38f903601324e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "2d528e6aa16546dee87a90a3652909473eeaf74645b714395999f4d3c6f7b82e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 158528, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "32a620154a6b25b692fdb9811125981c3dd420393d47528759bf661d074a5d1a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "0f39a5ae31c3a2cebb9242f4a277544a8c5bd073abf8be320a6a5077362d5d97"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "b163321cae495b976934fbdb9149583de6f6f52ef7aa78fc2ac863e8b8b7d843"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "c3090c70db329e27039c800d8835824fc707c47ce7925e08a61e3fab905d1ad9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "4f471870d3a63950e91e271ae3ce842b7a79458513139f9ab376a9bee98e705b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "04794a79e9ebb94c6a2f296796732eca5f34799543298ef263f075063fd2c1c9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "6463472f140c30eaca463e7e49046c97d23d0c4dd56fd44427ad3ab94aac2d64"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "286de0c9f3a5f13dcbb1ace76e6f9bf61afd4aa70a892f48ba2f81351b32f997"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 
64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "048e71a97d23ecc3f75d0ad579864ac20d2a2a900bd9e05d33f7c14c08d92e99"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "c9d4bc3fef194e019b00093b59126b9133fcb498f1acfc97862622cc7ecb3dad"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "6f8273f24b6bc17db6f34f50f345085130fc9dc0afcddb27dd6457fc411d0bac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, 
"cbacabdb90ac663532d91e343719c33eb9001678884d42275fe84463f7123a50"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "e540e1923eb984a7b8c210417d76b0e01b04488dd77958b9f6b292d42bb7ef24"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "99efb3294135acb237a1208e8ad421f165e9fdb0e2e9137f6af7a373baa49b58"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "2cf1877d8a12b00903824ed171b3bb0a08f0fc0c27893a4244a7c8578721bbfd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "7d46bfdc5aee775ac69d230eac8ce8bb24929f373f7edf2ccc6d9685fac5923d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 197960, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "cec194495a4fc1a4ea6ee1d05d986b09ee544d8f44be0674ad2156bcf953b93d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 152912, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "51b6847c4f13b90908a2df5c6843da7ce079c54f84c8a6c8e7ab5385a504f087"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "efe8977f3f29105caf5eb2da9984d541e73bbfa397d991bc90fd99ccc363bdc7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "2ff50ffd09a198c4c7b3fc4f96bb66d494b2ba2922e3c1eb4b8f29a02c58ffe9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "c2de1f35c8dafee2c03429787cd9ea894d7443853e57f31469e7ab39542cec95"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "a68b3d4fcd789664b9766f3676f4bde4015eed4982211f9c701459d0b7d1bf7a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 152896, 512, 2, 32, 1, 3, 0, 1, true, 
true, false, false, false, false, "1bb9cfff2b3b0928ee442bbab796d56270a8bff23d48f8aef032f8f8b91c395b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "2195b35fe2c22cc2c5d55c05650c748c94934bc975d472a43c202fe3b04e22a7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "90446bbee24dd698e0a2e16b7831da9047386ec8b8ad8188a05e8f30bf2dd463"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "7e56b72b6834217cd2f684298562a8e0d84625641af4dfea7f0a7a5af6a66d31"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 
512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "6782f7b86b6cec446e7fcd29b72a19bff2ae345b5054ce12b0fd3d739528ffee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "55884824ee6a46565cfaa7175240ec49a745736aad1e56abdca1d9d731b51e72"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158192, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "d9455f88fdcb8c1bb3a188fa3e334cf7189930c3e851521940da4f234f9ef6c2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "cc0762778425a0c0ba4d6ee7cf4e49a7be983a23378a55748ba2978e511149f5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 167408, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "6f1dc59f2873a8432d2efd71056a31931965224dd349666d71550b0fbf5fdbab"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "4a62bb22583822929da7b4dfc433d46621445fc6e3a341e8454c1e3b85109036"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 157168, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "8e1481dbd980e3a67a9eff13dbdd360a8793411da6820257da9d21f61080d2f3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 152880, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "98b60566cf0b35937604d7dede8e7dd40f5051c60e22e1df35f98f58e9431d09"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153584, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "86ff6dd7d3cb4881bbaee020129f3ebd83fcfd65665998fa0dca85e893741f53"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "beb11e86887cec5765df1241136a24ee6760e157934732ec8ba65df1255a6ae8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "d7abc53f8341f86e94cf1ed96777b546bc1022cd5bdc7a441d50042f8410a9d4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "f13bef05c6eb3942c7cb948b61b703c812bef617ad5d36d5b1df95112d3cfdda"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "d37091ded08df976374ea9db30c42a70513a41f3e32884bc9a7d556a55f91d6c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157024, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "7a8d3920ccbd043e577ef39b8a7a4d4a02b784f38cf0f4053660e6eeebecaaae"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "4c50b4fa54526d94f07984ad84f01dd92b4b6d637c8474a68aba4477da16a64f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "009d829ff5daafa9b944dcc62b4cee43b93228eef93d53186c45affe1c050491"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "261d4c4678be82c395de07c119a521fd32fb5d9c87a311caa2cb81ef3a4d4d91"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "191e8b93e214107fdb3a1df3846136e8a1cba0504ac1d5d0ff2bf48ec8598861"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 165376, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "8fceafefb1dccf42478633fcc4fbb764b7b33a065286dcac6a1f8b1519b68f5f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "c3778735763f786c030f7bd2c1078bda3c7e6a118171db3fe7449514c409d87d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "8aa04b03c8e204bd75967191243be80cb55f093f5317787712b922669569ea98"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "03e6b8e68e0a470556e7a2493df394a3da8317a0d1f70aa1ef30d8dffd7af575"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "1584d045f918c30644ea8ac69d4c3d692998d8d26eb959f58f56946046595e02"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "4b50ed51f680f7875e637c53ff052b717c3015a5b43d2c661fd062cb9b54e8c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "269ecb7ec22167da983fff26f9276f1b678a5ebbfe13c0aebfa177a8736a32f7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "e3b8fb37b79f9b09e9a4d5809a5fd0cd99cea62eb847ac2908ae2879a7ecae71"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "c07d5df1e4c0366ae07ce2f42dc73b2caf7d7eba29c43f68eed5b6f604c1bd28"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "4e260a4865812d0207177d5dc454b14f82358deb127f42cdb8de04fbd96f36ae"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159312, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "22799bc43040d986a0b6bd3732ffb4ade9e0218932abb82bed3b421e837afa68"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "302abe034d8a2760194596c41820191cbf49dc3de6cefc3112a0b1de50fabcf8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 
128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154576, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "5856ab5f6a87975d30f05ac1c70ebc4f6d9d686f1edd14fd3d0a2d1395e7a775"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "3eb5f55bf348e179353628c5e79e4f636cba7f42de20ee960dfbd881a0f7a4f5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "8464ea22e0fe2feb7946f08178e42325d89254d948a62e36a8cd7dc2ff5df9b7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "eb4208e206fb3cb476d2b9b1b8413d461a1bb28a1e47f5ec32bec10ebb02933a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 197960, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "a78a1294a60c48fab26e7668f2d253a843d661ade870b4d597b0d84d85cfbb45"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 152912, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "4408b9bdef6b2d2e91d11bae130ef873ebab1668b0376f1ef2817a132b4f16c0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, 
"8013a6100d4c1d8ef2173fa7d0e8ee8a0de529f56b718295ff26c93885ca6e4b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "4548304083103f858367df5a4ac458c466ef665867a468265ab237e5ac77c3ac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "27b9a9ae46315301c009b3a59e1dafc5740fc204a2daee6c55388606e655be88"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "c1d509ef1a60219f66c03372996be1d2435149291703c0d662ed2795994274cb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "4ac0900793e73a7ebe4d3b02c0e84acca94612a190052061b4f2a7faf4a35ba0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "f5906f1bde7dffccdd5a99ce81081e7d1853a7ba4519d58c60b1a1be3f81c014"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 152896, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "0e9360384a86256315abb1fd98ba1278b47baad3f9a0d9aab75b603f28257efd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "ab76a7e7ea3175330ea386e447282bd0a0223d8e4bda2e9378b87a920d5ec3a4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "58bfbdc2d4867dbfd50c529cf7a53415b5f0224b377cdf068f46fd4d0a5394b7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "931db573f51be56dd5c902286db0af4c092b94dc23ac97cc74c018e0dcf10af7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, 
"602bd4343a98c1d32e4851dccd460e069e20ca55c8b0819410c881a3f739f9d3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "0549850915da2da6cf87e442349ea9f1487d11e405267feb97b4024d3bf82833"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "36c0af15e3dce7e9e8a1f3163b99d8dad09ca5e564066fad737c71a0ffa6314d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "6dfeaa65e89a134c18d6e84dc3cdb197a9f2be3aa0ef72dfaa4699ab8bd2d390"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158192, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "7532c6eeac2353a637c2445eacf4435a68bd3a73ebd2a0ade54dc081d13ab7d2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "8065fd958e47825fd67b3974652fe89ef1e247b066c5832ddf092eb3eb49cf6a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 167408, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "ff1678e355c5ff65557d57634baf069f810c8252dfd8b8c2424757e60b7b0ebe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, 
"e9e947637b5fe3f4443d1c4486d5444e2007e5a130891823556b474d7b3a0b30"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 157168, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "c7025a70afdddfac285c74e5d196d4ae192d2cb2ac651ee2b887a7737e528bb8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 152880, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "7d563d25449c448221f75590ec9df20b3e316d7af489b43a9c830efce802e077"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153584, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "2d1d346da92abc15823a31bd29c8e52568bb8430c649c969c7d82ca906a7d245"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "fc9904f6df6cc7be92f8257578651ed4d6bc04f64bfb07c2643e8c84ff5d31c5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "d4117320642ec52d7243bef3fb5ce22fd599ec7e461d5f45dbf7c062c2d6807e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "c4f0425ddf3f30872fd5ad206de89a9f01b29a2e0081c4638fc93bea074a3981"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159312, 512, 2, 32, 2, 2, 1, 0, true, false, 
false, false, false, true, "b8c5f07d11ba87fd47f228261ac7fdbc11861fbd5f20ffc5bbbca5aa07922f91"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "505addd48075a23ed626f6603f8b1fc4438a4217e63d3987be2ee52e58753116"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154576, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "a9576b2d7f0da449f901a9924c130c26ea03308ef750dc58578d27bddaae997d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "9536b3f98ca03a02f9645e3e76d7a9204b02150298a93ecde3f1cc4bb660d51e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "c31bde0df951932fbecae2d89d4c551d81cec76596062d008f77d202f90a90a6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "fa4d3bfb6a540d309df0a6a6dd32ca2cb92a6282936e26033fdf358abd116da8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "9df99c54b2efab5bebd1fd1d0e3999ed477a71020505b4f787dea98cbe889d4f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "1a82296ab1740ebeec8fd8954b183074dddd89c4583789420f4caa6dbbf17b85"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 
DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "9f4fc9842db75596ce2d9e7e938d66a511a95c87191374fa37ac9a85a94a6de9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "30b95d13a727b58b3cf53d2eaa987dc878da2bef1d67af151c1a9342de91fa1b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "c62df59cf4b230e838499ebccdc0de4beeaa403812a69ebe4f605425a04ab515"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, 
"5719d8663d3f99ff4dff212eaf253155d77c9e72c6b3c4eedf5f16ab462dd873"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "23c4b90ea73192a4c2f80fb066fc8180c29f3594a31150aa83226ebced9ddb44"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "a5fd158208d706208ebf892900e4479517dd6f37d6387b964fd89d6e49614694"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "7ea15b9a0b482c92852946762c289d86a49849b12bdbeec004b14141cd342ca0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "66282a25bc6f0d62feea6ce813dfbdd69a73b9d9028099d0b5bcb022ef232f73"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "2746a1d7fb57d484eef574579e8da70673d174573fdc09cc17362105b81cff91"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "f119c7f6b4419509feff271f3a7d9ecc0a88890d4ec112246e1276d80101e2ed"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, 
"c34fe6d04eaf1a1cff3a692c58ff3ad50fa8c7afe6269cb6595878bdfbe6e51a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 148592, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "2e61ffa1c6bb9233f4ce49b0a68a2826b623bec064861b3351d6086fa23a0a6b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "e215bd846fa9ac4cdc6299fa63e3dce40748c9eeb27133672eb9c9cfeba4d0cd"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "7dfc7abcf53f971193e0dcfdf2a39ad832cbce5510f570a21858f2dbd86d7f99"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "9bf08a88a8fd634acb0e7160b29b7d7979749664b7dd92733b1777c6c92e3594"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "1d303817f3a8fdaf4c2791fe57be9a460d1eebcd0560d44f6b92e5d366b3ff7a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 148576, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "02a56e81eccea68c5594ae0c457a8e0bd4d8d0385b10edf37e114bd4554f933f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "64892afc26a975ff6cd733139765bde21a4f45b4e66d055e041a8562da04b203"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 
DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "e393546e0c6e0bc2d58d0b2681165d32f5308a3c09edbe1a995755715f383f31"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197904, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "6fcba903dcc71ade037e0d42494df74886d251c8d0285216b12767c4deb76684"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "141d9833a5e82bbd24540b42ae5ccd2f0729b217a8da870c6bbad47a422a785a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "cb426a2b0be3c476428d7577c8aa7e8cc3f89a90301bd74d98b2f878919f6850"}, -{ 
DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153872, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "1911d6c071f0ed884ac8e716eae111d52fc5a51cf6e1d10a4f7e04774a984dcc"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "3b751dc24ba1fc939e22d009fbb4a83399e8bac5c166e0b74e93cf55b931b708"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175376, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "c6a7dcf43828d674e4b4bc625d7487c02f82a950cd733beba5a6481c6d820acb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, 
"f89c9865d2791e8baa99c4d9c66ead313c2ed425f7cc2636a9f81de3fda997b2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165136, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "9f111c358d9b7fb73ba12057c7edaa6f4832668d77e3d1050952fdf61e4ae83b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 148560, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "044ec85e8ac9df15492837f4bd85ea6c1a652d64b8c5cd9bd29135ba08041fa2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143120, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "d2fe373aa922aad5e94639356da7f1b4d4870a34d2178904fb4c0541061f3b1e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 1, 2, 0, 0, true, 
true, false, false, false, false, "57e754a99c4d440ea04f7a68a49ec69e2bdd6c5850a302056e6416f4c1e3c1ad"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "d0fb57776cf118937fa0cc0a59cd915791854cf0a4dc31a5635d153018bf39bc"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "e7b8f70d83a9cf531fcaad97918ccf6347d57d99e248f7c6124d81684062c7d3"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "5817552e3aab120693f6f45673ec5abb5986e3345702fe91c772069a4a6cd2f8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164992, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "7acad8f29e27b192fa293de3ed3a61abb00e152335312729b2863a54a7e99d3d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "ef3cf4460aa324f746a148e8a6ed9fc81e14606b3f50783872946cae53e67356"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "7bc53a7f63b942950ef61d3de678bff68a61f3f6c3a4ae728df5b31b87c5cefa"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197904, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, 
"1975f00cf4c29f7a4218060e9ab7996addb308f7ffb721c34e2d8c62e435c5ad"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "777e93c76e59328e3ae57edc1afa051fc2d1be7854382f22e06a15449db6467e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 197920, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "2a543255872aa94ab1791ab14124606d3fb2c94ed056b8aae865b1f91608fb39"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "ace8108556a2a798d5cb1e52f151ee95b985fac51d5437ee69c42fe6396d643b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 180408, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "b84c39cb76748929d719921cdf7b01bdf558b0f1296ed2a40372169ffe944f7a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 173624, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "509f7571e6b71f0bbd130c06e370518cf20e1f6a5e19b4698d018927d206b7c5"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "bea90b7a2835c92c50f52b75c4c404e7979a4db79a485134d97efb8d8130a5c4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 1, 2, 0, 1, true, false, false, 
false, false, true, "d9ab528ea0d003b46f704472b307a4252b601b6a60b441a06a3a70e01c50b89c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "a27a4fb241db99066842c499df08b6b5b44a0eb268ae76a8ef10202ee514e309"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "d9ae465f6e01fa297ca854a6f3b410e3f4fab1c8fbca20fb1db1a89d534c53fb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "91d37de5762afe2b09e9fb351975e7c01261b49df017bc015e581e5a200b85ed"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "12931d0323fd17b7234ad0e148de75dd7de6102ac24341c74bb19b769ed5f0e1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 150880, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "1d2b2fab2f6e3984623466e8b94accde0aa5dcdf8597fbeecbf2f0da2d7c06e6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "ce25af6230e0d07a50b2bf9c4dab645e4eed528630a2e9d6cf953f3cd8d80240"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 142048, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "a2f9a769c0086285086fc913282f64654f787f8ad9ed04b24ad5ccef3bb604c6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "7fdec946d138ab3cf7be473f871b8dc2e61fa1dfa2aeb8925fbcffeb0f1b6fc4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "71ee04372c355dbd2cd9838c24f13f38e50beb7d3c9a8ca0ab2e8494eac20780"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "be3533ee5df6fe1093c3e42bc8439e2007a66e2215dfd03cc0385da0bd7c075c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "62af489362cbc97a3a46d2771850a3ff018440b74e3ae64f701191d380ddb201"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 148592, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "f36f06e16ca9cd17126202918113d3e42be019bdd61a80e9ca55755245248433"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "9ffa7e259aaad764024a4a58b2d73a5576ae1ffed4daf5ea40e1091be283f463"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 180408, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, 
true, "fc0c96b93ff43996216b402f051859ebeb88982ca8e423603d4be76c2dfb892a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 173624, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "dd2cf544922648385fe2a9d36b6590cc911cef5e312800aee95e415996d6c9d8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "65521fcb7bccb33eab1bcb6d65db3fb5556d0d84cf2bcf3345805c18914dd25e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "8d77e88250d7518dee5c2949ccaa49f95cc66ed0f2ebc4093ff0d7fb2c55fbc2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "528f11574a663181c2f6d9ba8b57b76caf94ad917e7d6fa8943b04b20f01e608"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 148576, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "157cb5feb1eb1bb2d4328af396048c4cd504a62a622ad561fb1a8456a95256b6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "5fb7d86cfa0a13b363c19f2486d3ae2dd679b89fa0c80309252e270dee8b058e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "06fec04416bc9ed1dbe172091360d74ba515ea1884d46fc1aee3ac5d6a40f1a9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "4391265ed89123f37964d9414d7676f49200e0c69f4b30e108829dafe7e8da9a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "92d06131b0f67e5644a96134e98336646cb1f75bf26a53fe92d3f28c9c9086de"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 
197904, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "61a72bafdbc2c039f5925b08f49b4d17119fbd888cb8941323e9ded7746f0aed"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "c408708063e7db1a6e39afa0ceb44f36a767e882513eec1a5ced3e0bc608af13"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "9f997b0922faa8da472bf7a96d44608b09d067c657cda056be875facd700e56a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153872, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "e7a766591d8cebeebfd79fc0b7837233881240c6bb54e5612c060676d58c715d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "267844730434270f0f488fbb5cff86f29206a632589a3c60ac8e0055a100f183"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175376, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "3a218bd1820fb2efd81d5c67ac7d087b20ab7a4da7387f94d417a554ad25c47a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "4a5b3faf76e4b52e3ed64bdba78ce877653410006e825d34a4584ddce8412027"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165136, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "afa3a9e1888f4227d8fa03d1875f840d72b94a1d5768168092c58807f49ce9d0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 148560, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "8122a291faa135387a8aced6e52131ab3ed208e79a96a052366589e77bd83265"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143120, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "0c7d6cc726a6fe6f45a082cb5c815b30fcf82a050111ffe1825f7ae6a7de4cce"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "e6f6b8e8cee0d0516d153254af3802af7e25eb83205afc7fec00e3ec25391f72"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, 
kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "d2099f1e3dffaa1faff87cad8fad129dc7ed0d2717726c31c5f0cba1156d3322"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "324638f0209975eba71123553c6682c72ac6f0e6670da849c13888d8262eff30"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 150880, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "5ed9eafbeb8186249a07b6f10dd74413f61857e89e2b5c21a358da4704382a39"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "708e1e07b4c1070e7ff3aa5520e3dd041050fdec57ff94262e3741234d06ea2d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 142048, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "22c04dca1e88372d0d5531a694bded106a2436b5a785612f87a705fe758ec6aa"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "ff7ebbea3fe2363bae5d7b0f005b23bde9f98928b53a4fddf9d9f045facc17c9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, 
"10b2d9c425a5fce89854dffd3b2e5bb07371a8c12fb6ea766463495e472658e3"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "d9645bf61090c382dd113e881a3e05e15238744e47c1f1aee79c4504c2dbaa84"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "c9810ccae9bcc14bda6ada9997ab9bfa227210bfbfec8027883c0ad8455f5035"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "a5e19fc53c110c952c4a01d1090e94b35920394010df4f0209e7f13d2cf7390a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 0, 
0, 1, 0, false, false, false, false, false, false, "02cc90ec4fe13cae78318b8329683cd217687ad6b69f064254f25f8e8f014623"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "777bb8ac154f9907be9d5c10758b0af56fceab4b8fa6f21073339b30c6453c5a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "46bfbc6cb123bba0ded5b8df1f3ee931d5fa381bcb882411dc683b1f6a4dbe14"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "7a30278d6f681153e9768b829c5d00059f27b98561b2900f9f41bb4342f83f78"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "c4ad78cebe0b455c98679f9f35207f24489e84e358a2bb9124b0f81ca748bde8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "733d64e078a2a3c7ce48d7188316046b4773633b54a89c57f1ba140fa71a8052"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "1819103d9bce014810358148a35b6b320ed3663e2d8d4a63824401a081c5e7e0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "3adbe3931245dea48d7d4242c85abe0ebb42ac7d60fffb9da63b8398838e9f81"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 
128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 197728, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "7bbd34b2698d890fd79be6ff789a2bf603cb24201ce36868543b83c6b26d4980"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "d0b6ab6030d6a9638d3087b89c506a33d2320e76f3cd85d378ad31dff3a034e2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 216680, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "303584d683cf493934db934d0c01de46273ea65dfd7e5f2770f7b04dc60ab1ac"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 164960, 
384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "522a642f4c2686b8172825e791843269a98baa2c13b097829c46d6d55d8a3c66"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "d507be22bdbe9d0dbf667e7c3fce804700ec55aca31564506bb83981852d0950"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 197712, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "ee30498b537bb755f781040918d3f13cf116ab9cf9e5588fb02203823c9e289c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "292108e4d395668d1bc896bde28fdf82d602ffa20ee5b71b417bb3389908bf6c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 183392, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "b4a8fd9236144f0ba06dda18d25afa75914977e443055895a1abf2d195d8d6e5"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 164944, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "6d1fdcbd0b71d41560a98fc52d57007a08b89b3a38e029252654ca5aef9a8380"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "d1b2e0eedd8dc43ef653bee2d0f0894b5cc5104ff214d62e0aa42324fa9ecc15"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "f1e7fb767cf375a5caa17221bcfef4a4fd53247ffaa6da23d39d2b3749241712"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 
256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197872, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "ebc9abbbc841543ca731db2b0aa4cf4e94df41275b4b855271bf1ef0329e8ef1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "8e2420732c1edb06a343d75f514b0c85d37d059f65cc80f40bdbb4b54a1d2f5c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 197696, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "cea40e6ee2cce1f9986d1527fa604e97f0eb05489a1e5a693cf51dd33430d69a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "ecfa25acb0f8ecf520246fd002ee801b85780a8c96443ef1cefdf8f69949dd02"}, -{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "f7b77b27c0d907bb24aba6aa6ec785f8c62451f749b3f16e20030a63fec773cb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 191760, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "76fe9b5a7b08c9f06bea0528bf56aa5c327fa1e9dd6b6685aa405f4315249735"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 183392, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "574cf4e6e4e25095b97564e1cbd882d6a10b9b6ee0ec12fc776606f5ae560e06"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165104, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, 
"4dcc19a045f633cd692ac99b3a701def0784c8e4038a0d3e3516695befb0de79"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 164928, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "aa0ac490a17913a5209fdd65e39191a0cd14043762d386d9ef17c22a28a2cad0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147216, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "3509ba7b5abc02d31edbd0285beeacd6d730585bcc6ec7792f025457700eff2a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "dc21586c50164e956f9dc34f6a269424e926b1813190dee007437ca21de80837"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 
2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "b69c67e699b4e78ee5d1a37f520452121c778d161f7894954094c1fbe2738595"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "c7b5c38fb22a105575b45e05bbdec70f0613b88718a0b0814709cd642c476634"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 183992, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "f4e253f98ef703f16978cea8ae675ad9bc514a87422b00d1b75595f2c39b1cbc"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 175160, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "d709b8a1fd796c12feaaac6001bd4e303b1032c67a559ccfa83001dc2003d30c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 150704, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "c05a944eb565b0e57b88d48358d6d31c0e8355e3487e4e7272c52d78914061b5"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 141872, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "65a73cf31c05dfed35f7f238e2f67788ae263e704feaf4a2df875a16a0fa1d70"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "bcd6b7300dbef954463d4cc526034b1a8a105b0de3895d5786b23ae15863670e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, 
"c7994228148352a5497f3d68026fcc28dd82d24f23e08677afaf81491d3ff9b7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "46957e181eb4b0ff2a0432d2464ad06cc69629ad16ae0a6ee4c561aa1d0c2770"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "692d51a7e7e7201f5ac86929ca8dcef3b20429807fff11655824b034d90f12d8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 154976, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "4067686fc959e068f43f3ca92289a28c0b7b62e1dc3883d92be62da9cac81274"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 150704, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "7f0e70b125e2ffb654c56057d20bacc081c89e00798c5bcee636d4a72ce670db"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 144096, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "9b87a8ae948d47b0bd361b008e5cac2cfdf6e122acbe88c922d684fc7e5d9942"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 141872, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "796eb33506c5baed1dbd2b53e594fd6f3584377a91ac0785c7042c15572129a1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 197728, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "f7b30ba722194dcf5ea090c9415fa028b9770bf9734f072c1805a66dd5679d22"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, 
kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "0aebceee86f7907800f7c1794e73b84adc9b1caf1b62fae3cb5cd088c253da67"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 216680, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "40e98b0b12b9e631fd993c498e44eef2256075db25eb96bbb6ee6b28565ee02e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 164960, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "27074882d536187540f5f372d9a2d8ea85360775a7e07b0fa3fef7a10102004d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "fcd3da7cb7d851277bcb90d43b6883bf9df707b129a645bce1a224d88c53ea5f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 183992, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "9afcc9253ddf3ad9f14804da24387d873bd2986ed52b174623b93970aedf75be"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 175160, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "da7a39beae1b6e39d9fe5ee04a12fd78db81d900b5cddbabdd3bbd320cef61c4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 197712, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "0e85ec8c5321328d4e5c39a65f7729726c8247e9494c44e7b9785b28f0d520c2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "41d9e71183dccf3836e522c96d0c8d9acc3c2efef900cfd8698502fffe270aca"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 183392, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "9ba4812bc395985bea396ecf3fcbf8bcd8e8ec37fb28ddef8219403afe001b6d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 164944, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, 
"458ba4ba994b71350d688aaeb31b8c4a1d2a4a520e3f2876522a5f23185aad36"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "5ca1757ce4a8989d0c4738585fe7d684017ca8496546ed1481cb12af7aac17c0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 150704, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "3afd63f7221f561aa8868fd02d639494b2edcff8d349f6b7520759a3de5f0da5"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 141872, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "f4527df8637c9dddcaf856c78f08d6b67f4d59f0efd6dba012d2bcdc3df78eb7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "908c186fd7511df6b370e5ca900479fa109d636c71114d203f477c9c73cf9f14"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197872, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "47865c857b712d5989043e594219fc29cf411c39016e2cd7515aca422fdd62b7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "4660da03de67981542d0f21ac2321b192c7d1077088465bded431d3587f2e47a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 197696, 384, 
2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "d050c396ff8a2b781b85d73bbec7a42e9b10826dac5f6d59364a9e7bddb97904"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "3bfeb1e45138c21c7304cff392284e604c43f8b62e4b195155c219e468fdcaf5"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "591b4311571a27b99f4229d16c454e35206e12948c7d0f9d5ef3d5936acaef96"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 191760, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "c893a5611c55b848329ea5469107230c01a9b81ab5087f29350decb9a60f9dcc"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 183392, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "4138a4538f725a6d63a73df4b97dda358eed376077c2308d0a59bc38a9ff2acd"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165104, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "949736fb84868e389a24ee1199fbb9fbeab8d4bf34799d027e6e43973f4ae65e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 164928, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "eeef2986bfb85c0c44cb6c0e3b704241521aefdb9ace2800f62f5e20b61b9b66"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 
147216, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "f6e7282897aa8edb3a2f814fec2508a9a8a6bc1a3d8eed83f189439a68374505"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "a543c36a905ecb3aa653314693e2023859c59b0a8fb34e4fa5956f66ff223abf"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "d9534a34001117cfcf987449217a28166d0b9be83f3ff9458cb663d5884095e4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "4f860bc28b290424c75be9f946199874e0cc2bcef02f59646bf24422058a5640"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 154976, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "6dafad0525daad008f392c1d5e3dbb8ec60b0343b8a764bcaa035c0ef4ecb8a6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 150704, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "bf64a9b27b737a7845f4173da19ff2d1b05c99eb782430a824673471175b8b9d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 144096, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "7c9fe9c3068b903697805561984e73d78d28c1abc1743d8ee9ded3e3bc3bde71"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 141872, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "ef9f867590a2ab3ceaf712c3ab52bad3393c74f0910405efd906a1263c4915cb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "0cbd89f8981af0348589c4c2128b2c12136c81769b18f9bf7d481220f8fe90ff"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "e16645414ac4e4e5964bd4e63e5b4a12dc877c539965a271f417731da824e503"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "705c074bdfa1a8a565837a96cd1ce7c6d0973fceefacb3e9167cc6e5000572ed"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "6992106127f9daa1dc2287e12f2333310d49e187ca562b878a61702380e3ef24"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "21b83fdcb53017552c1b0de15cd6b28c56d622dfcd08a5dbef52b9bda141ffd4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "05f18b5af25c6a520ae16b23615d463bc2150fe7caae604c87da4fe7468b99ed"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "060473049f6c794b0b3cd5c2506f176e066e6be64f5c18087ad8983220ea5f58"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 
64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "70ad92709632a45cc5af12514496b378d5623cb052771c301c508464d0966273"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "50faa967aa4e77484dd7acd65b311be6380bbb011da858c5dd335dc3629ed0f2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "93bba920e6b432daf9597f97afc38d5c0fa625416f7c31a8a2caa3b27e324c57"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, 
false, false, false, true, "19d97cc314f44481e1cc8453dd12f2320c873afd281e8971f410256f17475de0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "b6c9cb1ee577c45f99b97b5e658e5901774ce9978d356a4fc6436db0bbd1e422"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "277901588c9df647ac9a8c876a9b82d61904396fd7c2d3801b49b91fbc59ab93"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "b8d79d9f3b9ecbdd20dcff8c7b9a6589b80003fb08aa646f9d982fbe84eabeb7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 210104, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "2efd2c5699b35a274230edac78677f97fc2be7b7208f911db860f0a6665ec088"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "66845aa0fb274fb3848519fcba82012b92d7aa442e745ee64665511c19df3e4c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "52e9f089303ec84e6adecde8a64a9c9b77840b0bb452f61ef94b52f2381df8f7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "aad72453a124cf522051e9662b1de70b5a4b9dd1efeb7fa0fa278f79a7ed6e2e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 
DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "92339aa7e915fb0f5351af9ddc539844c6e165f875ca8dad3c4f374fb09102a5"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "3852e9ec1881607c4df9805c3d8158a8de7cc3225925bc03f42c8dde29bb8a10"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "35cf7696a1b709fdb5b21f996de3c81f040d0889bfdb27e9de1cb35e92f85044"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 1, true, true, false, 
false, false, false, "2e5ab1bc619b02a19ea6bc25627cb0147ede5e90e1c4507eb186ab6856bf9633"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "f7200b700dfa0e889f8f0e7cdfc665183ed64886d8dfdc66feb8edbb9f0e20c5"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "4251af5a935824c3a8eeea4aabfd8ee8d5918303045126fda089e0ec989cdc6e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "97f87ea64e84eb6050375021e12617fec439efd54c606b616134117272653b53"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 1, 3, 0, 0, true, true, 
false, false, false, false, "b5441eb8c46dfba8d4addad604d2360c3a996fb034f790f1567da4fbe7ec1cb3"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164192, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "55f9966f112d38a59eb293f1301988753e232938794105a54e5e308d76e32e05"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "19d06c62ff80e69e57a0233375aee8c4850e4afeee892de4410498ad83ff542d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 179552, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "1936970b3f43117493788df08f89a23d17a13d7b126308d3c454bddff0cfd684"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 
2, 0, 0, true, true, false, false, false, false, "7ccefb53d0514f0f681e34cd8e5f373ddaedc47aec00b841c74b1e344a8a8efc"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "d685011d90a426c1b3d708b6688576da3a587eeff13fb388ea35aee56e18317e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "0a6521e3e2b2811507bd0989e238d5a6c50bfc8164249d210d2ac2ad67fd0a93"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156512, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "840e9b289f9812b334a883382198feb3037faa7794c40c76578436a77ae4c3ba"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 
2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "5a731bbdb505ead7e85b9e9a6058f5955cd2cc4958e281f8fb2cd6c051a8831f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "44db4c595f9aaab956383ad569f4c3f1214b26e9bce2d87c22a0bb83f27a7921"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "117515778fca13a55758b789d47a31229d40a763fb62aa54bff9d413e698af1f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "4109afb53fc958af35a463c5c7e21014c2872f9ee6855a37bbd02a633da81917"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165072, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "a37ed123500ddb0fed6d57fb04d852868470cb06ad3e2e3444277ff896582181"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "a4a6706464308f44f91957f940b8252a424b338809f282c40971f202cf2aa2ce"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "c487dad787aeee935cb5cbacc81c1127feaa194571e9377031c49e45ea290ad6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, 
"d11890934725308610916778e4e295095a11b961394eb59a9a8ac37214b401f3"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "f6054df4e40d36d00fe47a341c3125dce9444824888a17de784928f2ab76dbd3"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181616, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "33f8e3d8aa0f6bc109fd05c284e8a72b6854086a6c7f05dcc4e03affeef791e0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "f2dcfa1c4e1d13f641dcd43d84f3e2e850385bdcdc722305af8ece3c93fdd376"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 195848, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "f7d5c55bb59c7e1c9f92490b6dfdda83e676b8d0ab357af372d59a518ce665e4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 190088, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "033d1fa624190961ad4dd9152dbe841d3064d4267f6e8311304ebfab3084f096"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "65e9096bef0fb43e1fbaf054ea06d7bda13a3426285f10850c21c6c08c3572e1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "9a554d3c382bdbfbb34b8edbfc37e7fd87b36e8d42356faaef15645dbb37ce60"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 
128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "74d87bc63b4124b8f562d93eaa6f8232a4f5907d3f55b1dff042cc0ee0409ac4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "10a602225801534833dbd5c41f14d461bb9f558e73ac9735ffdb5871c89f52fb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "013612d1b2ac8749a6ad18e9e4be376f282a880b781ffef9eefbaa07490e5a3a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "57f9b1421fd55a99b4e9a290a1b38dce2b2d8a5a641dce024182222dbac80044"}, -{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163248, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "55860e02ec851d78feb430454305521a68581c6faef0a607dc9882f5fc20cc44"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "134ddfd4d1c7c8625af7d281a3bb477340006dcdcad36d57b180a8b4d22e8472"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156464, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "b28234a2e9301150a5dd2eb762c080444750b06ab690d58f299f186e4bd85b53"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 
32, 1, 2, 0, 0, true, false, false, false, false, true, "46ee76229822477d6593653543a67ac20644ce6e701b929854378642d4563c3c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "58e39217a707eee1759654d2498b6999c640f43ff4eebedd50697c840c0c6e86"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "e856e20ce5b8b397ae8edb85fcf4db6178386bbe3a45b823cf554f3310c5e7e3"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 210104, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "a7ac43288709d2a42e53df8f093202a8db7c75b1870efc2b8b438e8043a98e2a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "f3ec32987731f17df7d61681a649be6f222d29cda50fad212371933e86b4b502"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "3888400064b479e82a60d5de4a20d5f7872e3df7c9a0be14e4e155b3d7c28738"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 195848, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "2f342efae600d6e1f72cf43c1d3ab645fba6a53e3991d9d18ba156c3345a9b63"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 190088, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "554d579edb18b80f7a1f9c723beeedba3ad74697832770d43f2d413e41dabc58"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "33ae00c720f7d316e4d8ba8a719b094778f8e676dd54457abf0853ab8f59d077"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "55ec96afd1a15ae697dc89d5f4f7359947945146de98d25443cacaf1fec6645b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "0ea2394d2b653225baf125813c7389f06ea9eb0f705f7cfa9ca3a5a32e3d1de9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "d62cc50a6859c818a11cb01d319cdb0081b017fb88247bb79fb0e9845ead81a4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "dbb20a16a0a7734f68b82e9be8bc3661a8ba7abc925cbda7c977f91ff3655978"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, 
"09595d52dca247653c91d59ac14044d6e1437ecfebd3a22dc377390ef1ca8617"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "a66ec6ebbcade92cda1a8ff4de28ec95cfcd0134fa776066b8f2e642ba6e3632"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "71ef06f620acf3cbf71a2cba6f68236065929a7d3fdc05edeab7d7c19c9f7aa2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "1654a7aed18d37fe8c5f0536cc459a064796a8661bd2c7ab6e08ad72f4b59d59"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "6be66d7c23e7fcee0f255335624bc83dd82a02744a4aac5eff8fe2cfd011e242"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "d300f33db1f1d8182d5b902e7eba0882bb014b613bc981e3862206e5d7052e7f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164192, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "1eee8fff5e517f7f829bb02cc71a5cfcc632ba9e12cc60ab488e114f1b2648c2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "b272c8cb57d1bc1bc5357f5580332420b021eac73fce8f5165c7c4d71aada4dd"}, 
-{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 179552, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "057e6e001ab2ce7b002832692beb9581717242feff9bf0e66d2e1179c5bfcdb7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "7daa9877baa3f6d30d33a619f8ffc39ea922e7b549884d6bae2fa8a675d40f5d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "64e2de81db46ea4936ec3f36e20d86d26b8205df3b21bc28c75639073e1e6fd7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "51b93ec111961e12ef6e51edb36c3ae695bbfe2c4e8364f926868710b132909d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156512, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "970c635c1c8192c9ca8841386b1b95a93ac3456ac2e92f290948139806a58fdb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "c6ac28543443ea7293afff32fd8b54e70ddb02af02d321ae3bab5192c72ee9a0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "024dcca05b5191c38676cb674a96fd3c0d43b15bbe5a5fe71d7f39e4baba9385"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 
64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "ea95f8c5b638cc4df685ad2ad88d97fc2b3ceba9e9175b922fe8684d0b8842d8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163248, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "05ae607700ad312a6b2d52139879c9c7189436571ac5e6deac64513bc4ae24dd"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "548d084487d2d6b83195efec7040346b850bda683805d3e74eab0146661e0bfc"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156464, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "38e4e31a107b3e20bd4cdff92a649fc1f3dd8d610f64c7b277456abd888c0d66"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "d7dfebded9ae12278e0fa47163756660c7bd08b8c56e4d334fd4c3a1b2e0f01b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 127296, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "9c0b6df16768698f28c3a9f0db67085388e006c8dbbaaeb4a08fba254965026f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "e86fb327001135cb37d69100317a8df06336a95e2cb78e49d32d5cfb1a21dc88"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "fd81e264d5f9e6269f973cb96cb84c98b8493e1ad4fc2edb076bffcd057fdf57"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "57aa5ac9effc4b71e14b35ba57f0064d3e57939ec89e96a3ed17a7573fd58c9d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "9e4caf9b3353d24dda5edaf0965f58b32ae24a3b6bcc66f2c7601b60886ef681"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "5850d52974268fb7c942fd8f6b06eebbba070ac0b20cce96e822bbee67d26d36"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 127296, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "3e0ad50623cff5c56348c857fd7ac2eb1e2e163ee633be646daeb414f9d04a23"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "8bda9621c1ff23192f9161ed6507ea3316765c65da0e9e3f90bd2d8eb0ed8472"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169232, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "a869eb76fc015640cdb6ef52ed592086a5dd1b2fd35346eea2ab1aff0cdfc5d8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "640c351c650a028166e010b2f2b8823f1397812efa4b30b97f8a47a0815acd5b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163744, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "f24bd3d8935ebf0156ba3a679aaa7cfb7441e1abf86f0c90b182b65818d7f6ec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "5cc0ef7064d4243b71504c297967fa3c0e4dd0d98f147f9622da2af3a7c747c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "45c68b17a4589540f3375c99a1d6ce2dabd4a8f9ed5e28cec43bd4c13c34c56b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "1ec165b86fa691fe154a189a84e316c255dbf38a2cb9e3b70a06b69c99f24152"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "2bf04521428c93ebae3cdebae5e430176cb2b58ea76f4fa16a3a43cc96c5427d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "d68a0e02b016b07e79e80a3d583162ac9d92507667260619c68424c4e3e72cdd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 127296, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "768f4616c385fa05288c620f65dcc7635c01250ed2c54f3675f3b99fd0003593"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "f66dadbb60397935a2d6fda16a678f1579593f0a10cda2b82c41aab825b666d8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169232, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "e18f6526457943f2fc30ced2694617ee6f619ff10d7da1f351abcc521861d1f8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, 
"0028961c22fd9182588a05ab31af7586fee2f9d143cd1a1ad4a765eef36101c8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163744, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "fc2082aaf436c915185d42fc5cb544550fd0c9bf22cc15d31d7f9ad96ad68a68"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "f7e621ff6ba7c771ebff25d6ecdfd8210b8de6330357d722676f844ead15327d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 224656, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "766996e62164b7c080de4fd6996e6c623c3117cc08945950d97c6d8829731ccd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "4efab4dd59bd4001cb6e6a7f3928000beaab2662d88ef9d02fed2a27f7a8525b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "8a022eeaaa08bc5f8c53ff41f0f2345419616881681602c492c57fb3faad18f7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "57948fae4e1bf88aff0962dc3e6ce48af9e3f98e1cf33c41d9bab44d083b1616"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "3500da9a3ed2f9b67baecf08bdc85a8215eeff63b842beabd548a1dbb9becc31"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "6ff78f1484fd60c8e119d5772041d78679bdab23659cb0b2d29827ef156dfccf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 224656, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "f4192910102b55d904069aa27ef9f1716efd1ff1fe633afb085f1955c22f1cd5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "9aaff75c8ef227802ff983c586865c42a2b1249a52e5cb299ba3a39a71d5f5d5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182544, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "bcb95d409b4706952d4b1bcaf73da9a9b603b4020bdfd0fad7da735128c07faf"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "a1996b4ec2b85d11282c0eb4d15312869648791b6ccfb271ff0d2ec1d6a641af"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 175008, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "f8f32e71f17d3f40b7d7e1433915d34996ced8e358fe62f7746baad56aeeaaaf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "1cba198fc92c8f9b62ba7caaf5db26fa2faecf2787cc1bab8809e19112d59a8c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "e855b1ee23b764a054b9bedd1805e98c52c029caed169b818da467ea3e5cbcbc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "91cd0c2bf6bdaf4acf9b695d7a59befc76a89b0695f39c32038508247f9af16f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "c185ecdab5c133660931eb573775706d4f3c931055e37f9f5f97a775f8c26bbd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, 
"d8983b7cd3524e57d7f9964ffb1171ee7090bb005830aebed973399c5e876fa2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 224656, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "7324ac7f27f909d6b3da368ef6d7183827a7b8ab7755352580456da952a5bf36"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "c65cf58df652364dc9456c16bdcdaa49c5141d2c3591d8f35340f49669924b5f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182544, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "d2be1ad3c5c5b2b28c1138f01dc5491526f30884cd5fda2cc60f6791ef98f4bb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "de775b413a6bffc8e202f99376ab75d9ee453e8bf8c048a8e6916f7d676ec869"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 175008, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "ac48190844ea9ecfea5d50b585e1c824c634c8a23b93f544b87f6f51b96077be"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "a49829588c4ff8a0af8f4b21f0b793a06652c40e67a687b06f29b6b4798e0074"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 64832, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, 
"61cce992dfe23d624b786bce6abaffd0d50567dd706439770e8c3cb25e4a8470"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "c4c69042ae0da50370e3409cc6ce98cd55664492098dabb12c34144136242ee8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "b06074d4e8f88a448ef0de2383c9483315e8699c043c571f04ef65ace414cea7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "305b2b2cbc4673e2ae144bf60b4774a9fafffb26d8b7188123cbb87452cc2cbe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "580364fcb8de0a50aaf63d6973ffb410002009032c5c2acdd99d5030b9e3baa1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "8d7a63923f9ce5c6679385d4557728b711e9c70c9dc9b135600aca78e32e1716"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 64832, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "9a34125d39a8236c4e38b42fa7a037c620ea24e52a80c07dd14d658d685bc9e1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "68af8d91c5039658626eaa512c7febccaeb5f530a76f2c250d4769c16ee5477e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "fd59d419cd10e8457d9db5f4096feaebcdad99c94b7c54561298e4253feec341"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "7e3624bb50bd2a1f234f7bf8e9ab7b022d6762ed774cbfba47e055285d0f63e1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121248, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "72d7c6b994c5da9e823c835bb1c5fcf41fccf0284219430c6724fe46f89f838b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "4f4cb5f277327da0409911419d35412e8b00ca1b386e68505bcb60896049d716"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "8bd4d7736c1d2d63c38acccd9663589b633bd1985161b81c77e29786bfd971a4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "3168a502b3de28a40a257401b4ea1a631c15a9222702192683006671a57f79f7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "7a0dceb04653792fb2dfe42a67dea212414bef94d7f0020d643620e146ae42a0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "b3b98677423d723069e30c67d8d53f65501b6a76309c8dd998eb6203e16c55ad"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 64832, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "cecdd21114538e9b4999b1fde74dbdd5f97c63ae82f1c2ec2b0bc09ca1e31105"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "eddf16763a0b71f7136b3ce857e87427a5ecdc5a55e6b8b854d981cd02598712"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125216, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, 
"5dd945f7a8d954d0255d54ab109bd3e675e4a531454f61191b5dd377c3d311fa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "96d109574da8b662f221677c9c79ff61512b0df9f1356abf9755a1468eca8507"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121248, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "8979ca0365b8252df63e232a80939fb372b70f5fd55f9f25a70c7aefe2d38a07"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "b298743a58a09e13074988274fa2d49f5b58fcddbeba4e764571409b32cd988f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "55546e440e2b190d78d2d7ea7f8ce33652d0a51c410d9546aae05419e1cb5930"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "3cbf2a3fc44ab92c53d959ca98ecd35c34e0e1becb90a5de5e07729ed8763c28"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "85a395d54235678965c767c8681c98fa59d7719f6ae757a374b48d1b0c745bb2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "5f79ec634daa6332f076038ac3a8699f9cf492949115c309a5e44747b35c2a36"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "5f0681376c66d567ae0c5d8244f2ee97a45b9f3c2ff3acd27f3c2b1d1c91d24b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "7b368898f39e3c5883593197c6630b23ca9c51d0b53fe3a3f497b1c578e44cf6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "e879bc7a95c8ab4aa89e558dbf32eb83efe85f55fd0651b15f451ecafbb13139"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 0, 0, 0, 0, false, 
false, false, false, false, true, "6c08c093739d9d3b2a9b8e11bf54909e48b69d821509708be45ffb1406faf0b7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "6d7d6dff2cff2c74709d55342f4a62617643a4b039e2ae17a46cdbb444ab583d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "49174ae398e81d0f37f4a8132f5d1a02c0fe2c56f68eed8fadb106dec8a38557"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "9fcdf81b688241df3a37a1e06973449ec486448468255b4ca11c9b28ad71b33b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "9194643bb3801aa297b3274b6210429f3f759d19cd055e7518cd500669ee0778"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "32145a94cf262f851d1bc3cea8332b820cd1f84cd01197349e2f587e9dac371f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "4cb583212a2d156c187a1b30f5a6e996439b566a41337b21775192fb0fa54f02"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 1, 0, 1, 
0, false, false, false, false, false, true, "52f7e3e915a854e90a987b98648f21d2c4eb0d3c641abbe46f65a177ae1beab0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "3e2d6a17a58c5c7a11f584736b2513edcc66426ecfc51c8560d974bae95f3ca3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "f8dd5d04ed444aa0e8c2a68ca3ab5e3264df2e807fc629f1344a7ddcd79f8d4f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "bb39559469781a48e2e4cbe1434e708352a55a22d707b5d071993218397082b7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "e80d416c0c031bba829dd38bc46fe2c11e89c473515bf96bd3a419286f218771"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "2e426e0fbb87bac16d4d187604895f2e2a38bfc8bceeac3d3312ee0fde1d59c7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "e9e91b3d795884fc23e8df4450f4dd1d3dfa30edf136250adf71470c65c5ac7b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, 
"50d10365e16428749bd45af0c229ff54455c8cc4ce84e343038c2a52be851315"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "c4a20135a5f4f0e969633c2b6e680b4dd24a13e80ad3b1fa1f59a721a7f49026"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "be77cd7703a5105a5172b2afd21bd3d1ecd19dcc232e1d58f387514be780cc27"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "5755e5ba7d56082a233a93deef4f0aceb3ae6f3547dac26440fab041f63fe60c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "50cc97a34500b8288840ee61634ce2f0152aeb3e3ab156469d53ca3a6d0e4a8e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "2dcc7ad9702acfcde82a9dfab308fb0e76340e2cfe804b4af0ce156ba9c7d7a3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "9cea34a04d6059cb332bbc2e24535ffe0bce4abe8577a88c6c47e209ad561ddf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "7b43ca9aa3fafc851ba96992720285c569e2ca5ea053d5397771d1c019f3e5b2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "3851eeb53e64a932339fc017d28bb2c24738cf0289fd7f45d4f84a6e4a997d54"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "0bb1f0cc855408c692fdffa133db176a783dc48e31f07f2b1670f4517d4fd71d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "2e7c5e250107537b3c1efeeb1f7b99b4673185e8bb1769a47f06da41c24f198b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "c0e8467b0c8e6ea8eb63afda171c8846a3493cc65cf23288a12a36c243572b10"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "92c2dd4c4b311d679781f4b86b971846171d518c4e41f6b498db96566b5cb5f1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "8dd72dc93a716963508b8cfdc0bc05ec87e5af068c35260061b1980d3a0a0a05"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, 
"5479e9536c902ad7f7f203163181debbffd581d0d511bc610bf57ce0334ad470"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "f9277b1f1dd44eaf1171048373dc98f430f04ed1f53d17b5e9ca087ec075ce6f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "9095978c53e416b019ee2fd5eb19f6a78cf91f8f727ee1cbbc7101692e852185"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "ef81058a93723f4c910f59fcdba7a1818dcd8e2259bb70f222be127abcd992ac"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "7bc7db17bd1c4639ec10bdb770eb90223b7dcb4f811f5484672c68cbb34c061f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "88a23a11bd865eb2fb871780bb8d9f89fab93cd53f974bb4e1875a8b543c6742"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "8ed0dcb6b650553bc7616ae5700e1258a2b7106e6d5a4f104b7625bd38c52471"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "58bd180b2fdfa6e7cf5f5125db636af92974e133fefd4297f1124f5f04d2f001"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "d5d3265b2ca043e99ffda31e33d7c3cb37a48a6ccbeea1da0c47ccd3b53e83a6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "9280b6b16f623a46c8f895ef022a08d084e3ff5ef237806d0d4903f7625fbe74"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "61ffbe1dcace7e76efedbe1fa37938371677f6c227d31adae72adbb78328ac9f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "455c0ba452172c87a10dbe29e851c184b88b705ce3ee3c30f125b6a23985736e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "e459204384d8b7a8216c6840f9c82cc370b948901da967408fac0131f282d8f6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "9a14bdac83ecc92701e8ae86aebe51fa34c28dec7589f1b089e21aba749c31a1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "9b68068950cc8e00c11407eece628b689eb6c576515789c2a35d40d8cb7849cf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "a8d0ee6be1323762d662e678ef76eae9316df74028687acf22f1889539913a49"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "2db89676426544d8cf3f25da02d44773e8ebbefc723318220cd9c612c2806f54"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "908a97c750c404b737bca8f9f43a4fa032982cd5e6afcaaff06a2c254cba048a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, 
"6ddd1eb0a6ed5ded94a3dccfb3e7238a2e7445931e083c143b4e5829fba69c50"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "f5658c6375bf3dae123d8f17c12c25f777ac9ce5e18b6643b60bb3ad4252495b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "6c62c59c7f7e9cd5b94db1768929652f9e5435a37cbd8475fb3870efc22c2a86"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "1e56c4ee8e4687f73b32a69849873f551e84994d61b58d0d062f34db6dda6e22"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "156fad272b42842b138c80deebec9ad571ddf56a611948f9940f00f1b6b6079f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "393bfc6aedce2b00a55f7571906a34bf30c2d2f41e074b32c2570a3c0d08bba3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "d833f60ad45aa56e1fe4b8aac7b30c1d57c33be59cfda8c1ed35f4ff50add44e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, 
"4b89b02845494fc4fcb569df9f91306f03b1a98f40ad5885375bb2d7db43d1b7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "fefbdfea7a70e47d89ae85527bc1aa7e970eb7039b495f60d3496d3fb3a65002"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "9652db3d622e050d61b81aea2e02f026ea0a336fac0d332baa23c411136246f1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "0f5cd79f05f606d96937be88f1a6d914a97df2be0baf3222bed7b04f0632cae5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "be33be26da0cf6a117a1dbb331dab37444d2c8ed64d8ec23c2bf0e1c539edee5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "d02f9443bde716f251c79c94646ab5d0c07d0b383cf2144bf2a867ae8c4ff685"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "a97e1e73762e95290bdc06c280a0598eb191203705c97508b186b3c86f5c47a7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "05b3f0cb8aa0ca8c01dce224ee99e1dc9c67c2c9d291629b7147059abfebe155"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "37103e126823475781e10d3f9ce89c75eaaed4e66ac2167fde94d23da7f7b095"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "398fdaabd95b0a731f4d528571a3c9d82cf2130097213cf4d8963b5b9e2ff421"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "2fe86a385e047f192ee0e51702c106cf2433cae75372638fe3a905235e3c31ea"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "9aa8eb616d14f349639ebd66438ebf2331e12852cefb62f6553af7418d461670"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 197056, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "0ab0bacbba32b61094662ab57e572f6c46a97776174709189c2098d7aa4d6e6b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "720d3f1ea9f41a8fc66df628683b43f10a4797e5a902c0943e6cb2524df93d1d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197072, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "bdb4720a4cfb0b28f05df37b2e62b2e4bdd17c95ab267a72739b76234096e7b9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 
128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196896, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "2f2cc6d337e67913a837f4ffd4316038fa04790802cf7b9ec9c911504b1e4c5b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 197056, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "952a659ea27d4bd51c88af5052c3694f63166a2e8e91e4b734e300383008e97b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "d908da8e3f6091a3800df8c3fbf5b9039b17247d283a74b40ad52c2bc73bc3d2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 
197072, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "73c7af5fcb9ab3d3600e974fcb74b751a3a120a6432c5f99641fb3d2e73ad35f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196896, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "6958c6e389d91d55bfd575bfdc8efa9aa1ace677c32a26c6b62432761e3e331f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197920, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "74b6e6785267f44d80e2bd44a6c79ddf7142a4e6e9a8e76811c7548355448748"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "891be1d62e3784e731e874ba38b73ff7abfb09a4dfc4e6d49ce58a2fc68c61c5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197936, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "49a47c557e99073c586f90131e3417a037e43f7bcf15d1d4f1df517e59925b80"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197760, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "75128fe47df03d9ed82415785f9a0a9851fa5d5d23832b88d23c9a91ee75edc0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197920, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "7f6cff45e10b0eb54aa918201a9187adbbeb9c0ceb3724309198c6d16c25d52d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "b8626f66319ec12466d6f1fac3615baf094999e20b66a00c889f017676292e1a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197936, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "d40dbb58f15eb798caa800b436d5bb71952c319bcd8988adc729afc5ace572bb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197760, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "3b2ad3d0d84130a29cb92b8546cf064acec752c898f2bc8242c6b080ecf48095"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 197056, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, false, "488c88f44db777412ebf12e0804cfa98cfe7b7f5fc9d06d88e73e4e292ccffbf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 
128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, false, "db3f2268befec853837daf3be93cf6a82272c991552aa751ef17928dcb055955"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197072, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, true, "2ff864e1cd81c494a86238eb130876f31b15a3a225d0c326ad822572c0592577"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196896, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, true, "c9041e1e106cdb5842c57d299da66e68ab02601552dc7514833d24b3afecc4c6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 197056, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, false, "db83ee691d475fa587a18dd9449ea2d6e7ac3d013df00fb3848f94f7cf75c5c2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 0, 0, 0, 0, false, false, false, false, false, false, "9ba0c5ed4ba2d94f85900efaa5d3658b1611517d3e6c435fbf053a3ab6587b43"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197072, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, true, "b160465f34745ca4eee91954d9867d75c7d48a3b4f46690cf83ac6e86c8fc7be"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196896, 512, 0, 0, 0, 0, 0, 0, false, false, false, false, false, true, "43f37f6b75d63a4a41cc016fe9e7f96c79f6de725ac2aa71b487de8e7fe4a8f3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 
256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "afa13529bd2826e2e5c38f2cf7c4c98550b426836fe092c92226f7d0e110cac6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "7409f8570737b22ff4996208a52f3d30468dd3f75efee79a0d00dd5930adf8cd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "109dc90aad36301f925baa212614f20fc8df36590a961e7b35a90ab53ba3258a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, 
"2c21b3a1ecf735aba7479b8bd4f4c4996aa3cc1321666a355c18369b2982e51d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "89fe3360483a7675c52fb98b89e31101cb77203eb615aaccb755946165b0841e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "d2f8f9c494643622b2fe5b7c29d651394579196d320bc6273dbac99037c878c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "3c72c14700167f1435f08ed6495ce158e7619549696364c7142d2c83eb3ed6a9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "1731743cf81c0d3f024ece9a3edaf730c0c60fbfca161b8577453b95f4f7ff1b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "2b5c99cd3fda63582b7a9deee60deb879e116bf019f7f7ae29bcadc4e789953d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "e90f5060721ca0429bbcb433dd9a8ecd09bae41b84449d92bddb2eb5b16aa1a5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "aba8cb84d3c2f21cacbf29a78a5122fb36737c7cf0d50ffd893738dab3be8cff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 
256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "6f5f3467f769947e80ba90d318ec8d3b6ae4b0ad702a01c5c3fdf6d62e0804d7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "9c256548c6c2cb0d353634af77c985568fe67cf2989bef6a0a79998d2016401a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "9ef4fe04d692459e24102a039af92ab7e5ccbe4c8f757a430170c99d47272f4e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, 
false, false, false, true, "1debee0c392b6c5a1bbb658d4c4feef8b2ec26b052321c0dbc61eecd6628a883"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "2d5108364aee49fe9053dbd218176c115fd34ed4e294fdbcae59ffcd6eb33cec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "2f24dca7f26452f3d97671ab5b05947ecbd23fd19d933c6642e519f857a1f433"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "cb028842edd317decb791c8a5f102ca64baa9b1b2d38ecff4933986cf66411f1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "15162b80b56e090a4e98dfa10f2a33729140dd146d0429bd4464c2eceec1e070"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "5d794bab706ee15feaaa6632b9984c7986e314a5252ed7dd7fa4b97b613efcb1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "4a64e862b01cf49e0130aed6937a203901539465c098706f65926aacbac2da61"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "3377efdc8d84d391ae86e0995836de96ea320b0ac1cd06c5f8d974264befeca8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "97535052ee3c5042229ddd09e8bf202d3ca1d521d53a0a84636be15f6bb5dfbd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "aea8477464b717c885d019e4dea3b2edf7955b77ad9379754a06810ee47f1695"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "947d29729d4e727115cd852c11148901be1147bae5e2a66cb1d8de681612b8d7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 0, 0, 
false, false, false, false, false, false, "6ca45dbbd886a44bdb8cce8cd6bfa7a86c0fcbc1d51bdff9f7b4d4d056dce272"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "8642a63cd261052b857b736ece72e10cd9785be9461d74297473f92428438691"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "8da9599443df2221c208f374de99b1e0dc9baf77f9d3e840ff4cbadc9d9c759a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "8174f7cb46c6880e7c0f13f0295c1e2e016a02d7989dd35968b3b60a01fd2954"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "8fb70620c4bd9e13f352324c8dab398d80a5843267151b23e8c2f75ca05ffc85"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "b5c483edd75aaac50b0ffb5db923d548c11dfe87bba5ac86000dc2047518431d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "7a5a2aec49e7a762ef3477f587dc9e61c6248b82a41a7335c1218b11807559f7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "b912abc0fd8d846b2743ab098a83134a07b570a69bc29bd0cfed779fc622cb38"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "8875dc825f26aeac5a804b3d904c6701dd95c1b71a33b2083ca17e4c6641c1c1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "d6a9d5c84d3e967bbc28cf22ee64b2e26c437b2424f34f35bad18295b18ddf70"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "f306b22e543b7f86c0e6ae8567c3514a99bf2ad142f6dcb03a1cd0cf60010cda"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "edd6a2585b75605f343679ff30f6594bb610468dfd693091f73e99f04ba5c19a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "9b3688cdaa4b285c15eda8c1c6cb57f697062acc9472f98f41bcf7da37cb03c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "fb65048956540e6c97a8d69e3a5321d1db0cd56dff37acc4de5ee42d9a48e8eb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "3d9a36b27931e2312ccbd720da6e3b69bac025de0d15021e10316f9b64b439d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "ef5ebd80c95fa6156e234302ba33001545e5548fd4a703b5b8439a6830198885"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "b2f2f491d54bb1589f2640f23f2d4474c533fff025a693911d486d8eb893760a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "43910c112895eee4e99ed7f83284e1b1e103700ea8924cfa505ecbb5aaab6109"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, 
"a1831284ecd5b68aabf3ccc7b5a80d3e72c5e26a96cdedb4007322f4b72f4586"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "36d5703bbf87311abcb6e401117f88e6df553273fae381b0f86359f45a7cb0b2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "c92c7c0abd3cc40af9633cdb807e769c58f4515c1149db5d06ebb1abb6df1fe4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "3b928f4f2c1728be4cff69a731bed68d239713738d7f67212f1cefd9cdb8e1b8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "0b63b29a77e4701e4ca23a5608b14ad24e86ffff1d92256ea7edeb16b6781d2d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "ba60ad81652359df24ec4c2d0db4baee57a8db33d7e24407db3096c7d6c4c3ac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "3fe207ba4ceb312cce1cfda31d47fc6abe172a9c415bc27a5ff0126b43cae57f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "dd5e5b5c520c78b6f01853d28467c60a846dc4fffe52cad2b9cf0cadcb624af0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, 
kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "44ea2d82cc54bc8176998c83968dda856f9b2d0e0bdcd17b0b335c3c1becd822"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "82e03afffb3db96aa27f3a2e095ccc341af6f96650ab6a7bb3da76752c2d7ef8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "98aa37e91a39f47d60793e3a2023e2244b828dc07df3db38a0b1d6fde3bdf5cb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "ee3f1aa9078874c08fc37d33953b61d3fe55d6ec6877942a44e5ed86e8991a20"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "2870c900aaa69d7e0428638af26eaa645cfa4e786c7ebb1864317dacc8fa19fe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "e9154046da49c4aedc4dcb25056db6da0bf256859852c4f7cc0431ebc3af1e9e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "8d1af3a718bf9c5ae83ffccd5ea2310d15a79bfc4a0ddb43691a35320b1331b4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "a0d57af70abf01b352bd12af3c4be0512d1c399682dd4c9952226a1c058db38a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "6a60cdaa31caf571d921aee61fefc676e0ac3ea3b9da5aaddc2bb5840703299a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "f0b6aae39a26e178c128fdbdd68934f4377b5dd1ccdc9a048cf1ab0217fa2aed"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "e177ce06e65e67f1e18cef7ca04c014da7e8c2de99e0b1e71964bb78ceffb90e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "ffd40e6f845a4c90dfb2e457095fab0f864a17528a4cfad709ca44a8bb8ef879"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "1964d8a0f6cf2b554d801af16afb5e51e06303aa6a6bed29b293d9198d1fca97"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "7fa98eb24e9337fa553799070e1d8eeaef8d8623ba175e8f4b6d586d8d8be5bd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "f64b8628d12d085af99dd34ff07236b02e41ee59a23cb54062e1335791aa9125"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "3c0fa5549b741672aa73ec6afd820c8cf07d634e9eb18e761b63a37f707dd4a9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "0a8ed98aad627cda1ecdd7854ae9de5d9211959c7c2f350622c105a4976ae257"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "ba4a063c3639b107addfc28f0fdbd34d585779d6fd791b52c71be5e181cb3c39"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "480acfd63f5366caa7d9a9a7c1f413a512ab5b18e717c6443b358f87a74e9092"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "f591ca7c1054fcbf6a6e537ea084c92c759da8e43052da0bbf23186b06a502c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "c3a90d5d18f7b11e31deae4546388b4cb19bb1b339f5295a8af099ab13c88c82"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 115104, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "add11ca06b1c225ccfca0f179abf195ac3de82957c2de0409209ab77e3ed6f6a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "2f8bdd698a4d79aaaf32aafd7d882203e8551aaa694bacde74c2ee5b99da9bbf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115120, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "fee81501c7b956464650c6fd9bc5b9d5877a8883a04f105e2fa9acc55c321d0f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 114944, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "3dda0bd236db1c9efd02f99dfc61abedfeb0004553925cdadff2455feea4a44d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 115104, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "9929d5e13b311b58026e16906c6ea98cb3c37acf90840fb7a917f63534060ac1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "a282c9e8c14f72385ca4b0355467a2b9e92f70aa04b7b91c07887290047c4b06"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115120, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "41c1f9df84a51b24d8431725f6dade7b534d1f306c0a3012854d32121d9759be"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 114944, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "bb272191ff1c4be5536277f964680667fefd978cc6cc30bcd2ae3d2728af82c0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 115968, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "d095e0f859e73c3995b536c109988b0c477294dd6fd754732439efeccfd40aa0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "b48c78a57760cac42ce280861fc47ad6867a3ad1fc00c5e48a57b10b20ee9cc2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115984, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "86d557ca128a071e64af0f534d32758957659a9fb4f424075e0a246f49ac0df0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 115808, 512, 2, 32, 1, 0, 0, 0, false, 
false, false, false, false, true, "2d0571409bbbbf593b26eac1ffb722b4f84a29af20e941cc6e8079a3f18d3aed"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 115968, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "f2f52aee5b13bdbe73d4733258d24109785bd636ee47e116512ac07d38b590f6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "1805950a8cf3cc7ebd958f67bb9844f1c7b94e7f5e43641ce3de66fab631fc03"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115984, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "262216593f1bee81c8771321066117f304063d41af0c479fa5ba237fa4ddf29d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 115808, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "33be2c918375e7cfa960202c8a257d41ab54e7c65ca1c2e75e5df9135f6e39c9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 115104, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, false, "d7282dd112d5cbf16a5d324b5fd6e2a7622cf92556a41e52f1a19dd5ec955c2c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, false, "4be85eb566f1fe0fe7574228f58ed7f8871f33b0c5006880c7fd1e862facdff4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115120, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, true, "ee4db783b94bb8f50434130ffe92348f947b8ed07cc3f07ee6592959e227b091"}, 
-{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 114944, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, true, "110878d72f63cdd78df45ec33a3e3840e2942753664f566e01725d67a6613e08"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 115104, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, false, "b32fde1da4244f5058ab22b6a117f8300c974bed5074bdaa1e5e9f87000f8cbf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 0, 0, 0, 0, false, false, false, false, false, false, "97b66a8a6f6f41b4cd00bbe0e1dd8ce10156d3266f0e8640a51f0584510bc799"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115120, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, true, "d02114ff0257c7c3447eab4143ab2c23dfa1e7677e66476b1d0a4340d37b7e8b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 114944, 512, 0, 0, 0, 0, 0, 0, false, false, false, false, false, true, "8fc3ee09072fed5cbd748d58c1f9c4336daec10db0033e04bde3fc94af8089b7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "f84d325ced08aa571636060a233d6d44478d7fbf7ab3f8ec2b975effeae5dd6d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "6317b56c14a27cdcd0fb29d158c51fe990ebeb4483c9420bebc80bfeb9c6336e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "07f8fb50048a66a25d45b418c0c316397c83a48ddb6e874766319116bfbf2530"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "65ded6b7ee028299706950215dc8b4812fc786a84139439ad3d5ab67837fe42d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "52ec1b1b259cb57615902a04bf80e4d3f88d56f1fa699c0b4219d0fdc3b8f43f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "9fe86df5af71de6d033b9cdbae354d2aa8eae13dbf61fe1dbbbd5c6c828fa766"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "c23174632b5a4adf8658eb5b919f2b6ba6ca80a9283bf131d36bbf72d83b45e7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "a7c6b4bbe38357b3778a2214fb6db8fdb01ff0955a903e1ccc9c7596d55335e7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "2c1870a4c8799fc6556820561ab1cea2783df6accf587689b499a4ad7324c40f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, 
"e291c1645c7e2ec34a991eea9d39a8e2cf1c62f2e773467847bc9fd2698ccf5a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "eec0160ef39e019a13778cc9e5f9a5a379767cd961c0ee59b64923a05dcfc9d8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "092dfe4f265da7331b0724928672b454b49b0e40c23d08fda0812b1565f74c66"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "14f7b5d5aeba9a1056d368a9eba010cbb50669fd95c18b2524073ab84d80d358"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "80f051e0a6ce5a08b3df57a0356beefaeb00053129b0643f1a51528f90633ca4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "777db01b07d5ecbecd0f9fa489b7a4e0428234f28708dab7e77358da70770ac6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "e649fd8485c0a7a1a40da2677bf0a429626c3355cc059ec9781188d52e4564a4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "ef888e9d4bc4db871ff2c1571824bcef9972ca8083be8fa4b1bd0602513af070"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "a5126e6146a885dd030e9f31c4cbff06c8e2f7f6383a17558acf7e9e695bec63"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "1a8a6d5a09f94d972da2913075529b3d1ed60cc9f7ffdddf1d117656e47d1758"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "2bad2fd7e4bb1a6e6dfebe3fc3f05be19a56d6675fc722e976e58be17cffe81d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "e2fc34996aca231d0c56443d267c9f9dd14a78160162dd85f8891ac50e6a227b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 
128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "bc0a85c7df525db2f25d1589b9ee6996d42f2065ae57fe4145acc37cfd6d8651"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "1bfc7d44c873b5486083fd203046e00078d1ebd34b92833f4dae051bef3f54d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "cf8e34e86e006f1655a4032a289a9d122efae776991650610504340d1be8db5e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 
1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "850a273070d8ce128089294a42f44e58f6a0635b1f8924529e6c7fafc91d6f68"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "50c9bedd367db1739a2cb51489b14e53025466da9367fd1bdda069bcf0f8a4af"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "3bcd23bb152282f50f77d7b5672958a539c6a4e6804219d3065e2e3d7e0e2be1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "6b6d11e95318fd83fce22a145c5b46f2742f780895b7c701b749b7b09541c334"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "ecd32a6ff054be967e861a1d21be345f8e10c5ffbf49a5c62cdd4ae1bf8e1065"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "6b2b5d1ae670eb346cd305450494393f5f0b530af97f23000d2aba33340c68e1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "586c0add029bafe6bb98e5431701dd0978f0b1dec9e0e8d299cd9fc57ba9b2f2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "0a15ad7c30917080ca312f4d7c13f6959cb0d0f4e18dacb1e4171de549f8d0cf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "89622898e71a7f1d0259594a88249da9c6b37c3cdeb4c38e911decdfb44f299e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "3e9e64848f1a1ae16c81dcebbf72daec424c3c0ea71155a45f4eaec08d31794e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "d715944285cbc6066b1812eaed7f6c7227ae38cdd4eabb7db25f2cd477f6bef6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, 
"4ac2185564c8747a3ae4634e3292fff4d0a6c2ecbb8cb3fefd4771ea627d4d2e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "457467f51718f7d4746a826fd28aec89b51e76126dd749c8ec34a99b83dbb036"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "399aef93072540c236f7b5819935fc9849082762c59eb0f6f876a8f771a3a981"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "44b8c497217309f148ef9a4f883ddb0589040da9b6479edc0aa15eb81cc04f43"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 
214192, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "fb1d9042179226638a565582ef054d7d6b30e836ab84739c2168adec4e5d58b5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "b2bd7168d997da92531b27437e65b5ad08bb216a7d81c8d4c1cbfad0bf0fa419"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "e5f94d74378b6b3da44acf6b4182c9cd0de3c64a0a0b7612375f4aaad27fbf0c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "c91b68c5af215587713506286a3c615f892a8dadef74ddeeb01f9225dc1206cc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "5b9ce262d4ceb7a6f1e2f92d177308ffc38ea26939901a8b304efcbc5379842d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "d1dacfad123a91d29a4f17060fa014aff2490517d3250390fa79dadfbc8be113"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "0c417615368d6d16558d894475e084e22c7c77e6ae064e8396ea80beee8cedfe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "617dcd5ebfa62de3130607e1b9ccae12bd92777a602c5d050410a82f4c8f97ac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 
256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "63009ae25027f5fca1a6ea70bef81487914f93dc8e06090260f202d07b82a57f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "eba081eea30766d1292700681970d32b80d0fa4b26c9a0e58a6ce66543c73322"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "95c1ed0f0dbec3be78ef0996c31243355bf67a752149d59f735abee9f193ea04"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "c53627cc057ffe46efbaeead14302d3826a691f468817f057734d5d6656b6eaa"}, 
-{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "629f305a1298ed68ba933d547f8e1b94a1727d3f8f0392af3da1bbcf1b33be29"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "6fddfac6cf592e3cfbbfd57989dd80f08e404bfc436f9a88c94d39b5d3904274"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "31edcb763c33eb06aaa181eb7522f117729de612c73a091e1d6ab98bfde8fcd1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, 
"200cf1744c638c8b4ac6ef160cfb76c59a3e8249159626022c1b5a836fe5feac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "048874ea0d51efe08cbaa380e432e239693c3cfacb81e9c89d9681f37264abc9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "e8767d83c6f2226b26e5241e582073d8e83da58c381ed4f006c150934415fd8a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "cc868b042d5788fedc27316b7446b4eabc93a23228024688240d11f099c7aa07"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "3e3fda8bc1f066e1b471e7f352c7acb5a00c40de030318e0e7fde7dd750a51b8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "cbe3c10725dbc61d65b913bf46b5ed5982bad39cc08b6e3c67ec2530100ffbd8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "027a8388e45d4dc99cbc897dc411c7b77bb172ea84998f3007b115a99ae9b62a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "95f8e7339b314826d70ecbb46452436c3b29ca9187074bbdad2d61396d05d191"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "20f0c91df62161d97993d4047a6186aee28a1cc85877077db82a4f916019edac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "f4ba5e7c38ec0cf4e383bf3a387a46fec8f7e578c32fbdbfb2e3c250c54ee172"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "c666c60ed42b858d7538941109565c6ec62c578a29018ca086506cd8d6f9d08f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "f1d57eac16f17ddb93f7b627a5af288e0a096b0341bc07d67cf464736aa54945"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "707b4c37e0d2553646b9e01207dfe765d9d067f4cdcf951818c3a1f1bd18015f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "34cd1d1435e54983adf091d16f0a278c39b0db489e5c60cd9846e200fddc359b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "921c237a5c883ddaf82a6caac888d185f851a6e7cce3681d2c35e15a0bd4a4da"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, 
"60dae37b4a13056ff5071531578d19f432a059c8dbc69b6aedbab0c5c4918016"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "61efbb424eff9375024935121038161b586203d3b8b5eecb5671637e90c2d584"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "3c30b252c50c483c477da123e18d8bf036931f44608735edfb52002884b01f40"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "0860178274e550264990e4f457da2cfd09e47c55e30efa84a7379f5277bcb485"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "c38e83502489922cf9ffea04cba0ffc631f3eb077f2c686cdc20740c08369ebd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "87dc769a3d623ac922e9460d2c0ceb02cb6f8cf45da50c7d4540be4ff7eadae6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "289feac6949fb9986501feb2af0c7688dbcab2cf14c3dc2c182eaad8d46b5559"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "faed849873af565c6cfd22cbb2247a5ca00223601d36a2ed186909fa82dbb3a8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "5be5fca8dc01d2cf2db7a1cc7065d971b79ff22a180757a8a28dfc311c2d21fe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "4d8e86d234f5792305cc19aeada95c97418cd970ea939579ef28384d15195642"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "b35f8b76a45400eb1b953f67221a57d1fa6b6a1db5414ac019fe5f64958d82a0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "304204b76c2220cdb2ed4924e55e7c16ecd78b7e64b95f2c5ac35df30ba26684"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, 
kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "c8ade6e7cc361fd5c246ab4e12911ff3e4dbb4758f4957d56e357da0331a572a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "39dc1d0385dbdbe87539726e53b72ef789acb2dbd7d593416c62298d672a8d25"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "63a0e8832fac208ac2799a812d494942caf7a41ac43257a4668477986670b6ef"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, 
false, false, false, false, false, false, "7a65a03794051caccbee18832eff98105403e53cc7bf7ba3b1dcdaa81654c52b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "48dfc4a47ab5881a119f2855d2fa171b232e98628b3a30c6d7900b46ca72fc4a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "b43e244271587f3b218b2c4bdf21949259ae9ac09defb8fa2245bb4207eb1ae3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "ad1e4da9a3b477b3ca4cdb3dbfd081a4f1ad66e1c62a9cfffa7774145dc11534"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "8b9f47a85f25cb21f1b52fd253f1d809a34bfa03223c451c5c318c6e2ec50f99"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "332d0eed5725a4b503ae4419098aeed1e150f8f0c0948fa3661f89acc3ae6684"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "adf2b73238d7da9bccb2b542c718b68ca1cec1f306a2ee0f250aa7efe036dd1a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "f174618947304ad2c094c462806b3242b9942d61e34b4656947d112e7d030e43"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "f277648305b1fb7b714a3fc381fc8be9d2544be4f750cca6cca7e53f41e9d6b8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "73bc641f6920a3db298843ac5a346368f4e9652fca8be32e66053d2eaa8d832a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "b8bbfe11b3dd2b44678b9d411c3eb21724664b90d1b2f8ceb18ea098dc8760eb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, 
"1adf10de49ec3a9bc3a174baac097d07d3682023538ad06cb6becd3fdb318dc9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "698b18d0e86a3e92732402fa74f47cf9c43f4444432dd0bb3166aae8a25e9ec0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "7781f36f651c5408ffa89de39d8f794627e378a79aad87eaa7df1bee4de31bfb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "b7a46333aaf6e417d1bde59766f6a39fd015519454fe109f01d6a19f38de34be"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 1, 
0, 0, 0, false, false, false, false, false, true, "b24f264fc269f1cb6254bdb2fe073539171134847c61054cb1888381ad878aaf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "b556d8cbbf6b2a3543203f3e8d78b9c334e3a707cc8f5d45f5aca7fd8d0ffa58"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "0308eb88a129669650aef626878254392794b7227a8c5fa58a3126b67820a309"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "d02f93cdfc552f71b1cba5a9b8f70b470b5e795fb33b0c6fc14a2794d2329f72"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "6085b8985ec5bb1d33a78068728f28f65cf1b0227bbd0d8895c7a9e021496470"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "66d436e0ff7ac906df6ce8c2c84ed1a04f4bb88e82f7ea6c8c595854d0bee12f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "77940f13c89f018977640a5c0936395796e0e072f569e955f4c46876beb258b6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "e183380c6fe75e6efb82d6b9030f1a2e769559cbe6d8829dc9bf38fb7e6add25"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "3a103ea4edaec506b16b607eafe2f769b6fdf06cf64a96b5cd7da9665222b9b5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "d888228ee902d2d051d05806c2e23af5d374a156478939786914786ba71c2024"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "88eb7d0229507de84ec368abd88c45b10877107b325faadb3efbd1536b379f3f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, 
"d689402477f54b1b839e7a409bf1df280268ba179270872ca6fa0bffdcc1961d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "a20afdf26e795f87ea346abacafd872c6fe6fc2fb683e8b1dce44d95c9c41e30"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "e94182cf59361c482d72fe07e713d0e4cb0bb7dc81c77b2cf4933545626667e7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "3db2fca1dafae8894cb6fc2d4be6b525d19043485e602300b72a80ff3099c6d7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 
32, 0, 0, 1, 0, false, false, false, false, false, true, "9ae68ac9deea531ef205c169cf4e923515bfdd6b397523d5668f3ff3fd02f93d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "aba0d3e9142cb6b7f5f6b4a58b5ba505316ea6d62aa4026c185c1fa081f5d4d7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "dd98264e97215a1884a45ce6fe5ef8b2ad0f0dd5b7068b8359e5b17dace388dd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "1ea541c94826beb40789216d3b869afa13b972a158370f6f8211b42343c4c1c0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "92c1446ad9a640adcf436f72c7cb6ec0322932abf9b30529c042b5ec95fc6a2c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "00414a591a8ea5b9e2328dd475565e9546318e0486051b3b651c9abe08a92fc4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "648784d13429741020225752bf383fd5dc1e9e04efc010f8f287a21376508298"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "d82da5d953958da40b6826cb3d5f524c90d7dd4be24230b089abea33f2afc2c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 
64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "c5bf62ae7d1562a7abc2dd6964a7b6771302401f96cd01951dc143dede0fca32"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "5937ac8fc77108b54022cc3e9e8ef0c3e2b60c9387e63ce8778fd4c4e95b6290"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "c5fa72134d101905fea83e1b79475e72bdbb4078135634e48b18b5a222612638"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "8569c0a7c869f8ddef21e1804f906e7d528355884a4bf8398d48cee372ef41e3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "a5250183eb34151629e06ba64ae02e4be6d41a5445e76c1b688e57888aa014b6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "a300a03431937b1732ff52d79f9fb71dc2202fe9ca69c2d629d408d7a5be5a3d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "6d8fe6f9429b16b002375cb5ecf24ae12aaa1ba0a33e71eb95708af0d325a0b2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, 
"6a1b660759bd3e3a24138f8d74c2a2b16ce1ad9d12c7fec8b6042152dd43ecf3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "193fd81dd124cf306aa8c95ed7ecf4fbf3097bc186400b84f3a044e931faea79"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "3d081936898cb9414997b4d482453c4a00e733dab554d30fac335e2a34ae1852"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "aef6001172bdb4193d61d9bb768a4148eec39ddb1c6e1b064346505314ddaddb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "107695b9c35331b216719bca74f5de70508196203bddae30a459c5ee80041661"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "92dd146a4cd08ec865fbb3a13d2b0b7f08c5298ebb6aefec41603486bcff98ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "97d306e3fffd82724c0296d53197bdab840c4d2c039f9c7509e3b59702c81913"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "e7952fe561a20b58fedd381dcdb596936012f862cb50fb510cbd3cb800cd40f9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "04038177522fa81a3edb232f5acaf1893b09adc9b7f62edb1f1e2e0d7581ee94"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "a95a9e0e98a3c5e331d7a36f39f81e640b37e9a0d4ccffafbecf72a22bfb1885"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "46d57a97f3c45ebb18905642c6cdf574e919a6c34d1e4768dc015127ef041f80"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "0e6ec61828d91c120da9ae507d087d5b9bf77d5d5da62ee9000c1230994f36e7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "35759e882ef37a4030e6382129cf71968b4bbaf3bc5881783a8b5bbf65d86358"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "e726ca3f061dd04ebd52c3463aa44d3e7cec7cce92bed08e952ba754694c17c7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "ea294dc5c3a9c87a40d2b997fafa79e695d6c207c737ceb93baa2e9e91387883"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, 
false, false, false, false, "de9878c5214943c0daa1f5a51ae225ed6b2b2dab947c1c9a4ecb6506a4e82552"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "5b6b4ee8f7b6813f2701e2a121e0a537b676bd8bb24fe4d1b4ab5a9971d69034"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "47aedfd269002d50a85a19cb119777e43d4b883563b8950d3ce3d66deab795e1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "555654df291e5da37e8de5b2a8537e83e0c45a352aa94faeed758e56a34b4ca8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 
82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "ff0da5afeaf623b20f36b6e841f43cb0471051ef2be6377486efa123949ff681"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "345fb2b796b431e93c46dca7742fec35cbe4fee1f6b04101f33e1490bf4a83a4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "5d8ee56e403684894759a4206f2aa6781609c82b25c6cbae76c85ddc1e02b1c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "fb6b5fffa0ad340e5d32bf4af5e18130758a46e0ab3832dc2c004d7b73458e37"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "5dfcc531ee0bd38fe1235889ae11d60ef3133bdc8e9e9314c02a6066c224b86b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "00559bdcd6049d8ea20ff1a4618ffda94c0692c1549113d39a2b03cbae6ff1fa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "f83733fed2dc53dd935dd049da540117b9f16f986ba3c5a530e668baf83af6ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "613fb12eb4de55a997c4dbdefd3f3bb38babdd25c3e59496e6aeeeda6bed34bd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 
128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "7625eb65847ad4fcc319b35f38f51a29bb1d43f855f637b99bd6644b7d3e1dce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "fbc082736fb600a8eddee6b5e99cd890c8d07bab35ba62b52d2144c3317250e7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "a22ba55295d32463740b266148f85e1d2abe846377418b824a91ac7725d4c4db"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "84add08c279c040f2d5813af71d3088aa54d9fb2910f608015c0d00b45b5304b"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "fd09854c9aaeab34e563737c9f738a0e8d3ba51a2719df5a83ae6344f67d64db"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "eb8324f0220720d6d965c5591b5ce9a48bf81a0a4390e7afbae45c1c04422f2c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "ad0f304a1dc6730fc30a6074a271a42d893e2e339d66ce9ab7e7f1761661ba64"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, 
"aa2ef8cf12aad87bcc6144583991300b2ee1f310b1e2c2b119e6af8490c5448c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "eb3aeb87731a2a8039334689a23d019ac522b99d6cad32ea935795df6de5e9a9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "b50f73a2110db73b0a62560302976a5c643e0638e845fd294ba6afd1943b492b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "a39d55115027f589f9f437ffa57cba9a9ffe5616ac95580b6ec1e6b8b6c84a1c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "2239e80929f68fb20d6cdd110247d6a832944933d49687deef5de377892db32c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "783e68654c2e20efe3f639af3f04f3f1af32c6a1f39b26b841330fe79a3edf85"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "a47da1ced8314930807bb7ea7a55a2938799caf824db173a55876ef13f8eebbf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "65b520d551e8ef8db832bd2bbbe7037f414065f14edd7a6c71f32ad3fb47975a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "8e925d25b3f1bdcb6d69d89e7f4b2da776ac89a1792af518cecbc1228ae9977e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "953728d5e9a6fdd529b8e61852e74203c920faea4a11cdab7a32142364190107"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "f0bbe4030ec9eba095eaf8236c5df5d84bfead34a4bb479121799bc9b5ba5876"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "a1220a88f6a718c8983224c731b57a3c2203d8e420e52c8539103fcde61453fd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 
128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "4cdc202d2e1aac5eb6da344a89d5df2195498b1cab2fcf7dabd7ee38d668fae5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "d9cf954ffc1a402f1a00f8d70368183f02e3226262a5d8e60a9d363445b7863c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "543bd1009277ddabe3d530c0b354fcb56cd31bdc1736278d7cd739092451aa77"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 
213504, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "ea25fc7e832659cb5b82b4010aa9c409e71054f9b5d1bf04d661294c61e79b67"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "1cd6b83773f2fe4b33fd257a28d9846496370528303ed2b80392b1028d6bc8a1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "430854db3b4c14cd03167a5ece71064f265099e674bc7cbf0dd1c18f1fc6c6ac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "49a3f31ecaf1cd6556cb6b5c659d790b8779adcd9e742178337a9679836b9248"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "c485634c7b175dfedf565d4064633e2586923cb14f7e2bdda11e4644cc87440b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "cd14c74ff65e5090aadf48428569508ff6e5d973957b33a65dfcd5b9f2916b3c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "ab4728da53641223e8c191d5f524a51baebcb090497beebd0f3575a0fffbecfc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "9f2c30237c6ec81888c6f03d67f0d24bd707f470806a23105b0c0129833712bc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "adc101060c4dc9e9493c63c564c7b9832efb207c72772235dd707a1847e297a2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "5344efe512bfa4a3b2c429120c60d0d3a5f984cbcda365954f82147ee5385b3f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "9dd7c0f67dad5fa174d6d49babc17741d48f108b9afd7c42b143267fb5594ebc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, 
"67b14cfbcdbe231e78e0101306f62d69a4fec6738bb85b91ab754a0803a6ea04"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "10c7c2c7121cf2564e06ef435c67ad50dbd9c70dbd0b4cc9752415b444cc41e0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "f646b80f6d9a5be96de929b3c64ba73d6a9e60b4193417e97742d6a44d0e44f2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "3defa16e4f8b135b91608576e94b2d25dbc72839aaf91e50785dc0f2149c1537"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "1469ed5d91cdf5e5bb7935851415181010b6936af1c8bece6c3b9149b79f8240"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "6b28a94cdbf06e74a3121bd338af48adb4d9c84a0ef26e668c40907606ea2acb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "0e08707a536a63cf739e2e509e32599e224836fd579b33ed7edef3c222d5f2d3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "fa1b5743996ee9d43f0bba89daa61aef41ac16ee8f224679b2e2e1b47c9ea42f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "3203c112e43359c2a08073c2de9b6ec42645d32807308a7c02687d20ad66aa99"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "b9bcbd850a68927fca788069d6d5ac77b1a7d2785e19f1f12bada8a1c330dc74"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "6b7c105fc292bf3bdc65469c01bcb26cc2898fc80f8a983a9ef6e7d199854633"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "009d099550aa635fd162ff6cc3666c5d0c019824b17635633c4fc2cb46e0433d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "89be03603e424cc407e405ad4ab3210f63d99507e11dfb8a005103a1f7f6c023"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "db927607e00fb01cf9afe6378b0a9a7a55fd068fe18630fa5c0a29f5936bb4b3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "f3750226da7b19a58c7fa11c65d4a096afe9486b64a411cffdd535925a665eca"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, 
false, false, "e7e254c7a70fefafe49ddb7cfdc9c18d1e8f9e551545e75f76b2e24584237181"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "467745c2913c68c2fb1993cbc6542d4a1c8791b0f74e2cace0f7c98071df71fc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "204388d08c682e240491bc04e4ab89adfaeb7ea0e8e340eb62ebc25ce776ec0e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "ff4a0dd8fc437450287e51d2300042cd1d0aaf2e9191b0e1abf2650b710082d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 0, 
0, 1, 0, false, false, false, false, false, false, "451ae83d772a3cb0438422bbe750e21a17d3646f98e92fc0a4960fabf67d89f4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "2b20da3ed3fede2a2b60eda0e902a11d020dc2909de60604d16d80fbd5c5845f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "8ae14f6fd9ea37425fef24be276eee44b4f45b09bd9472143c6ebb16b338abfc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "d11e48d3a834c2bc53a736f719dfdf4a700533a46a92e286e3a00f0d1f640047"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "8c03f046f66b4166a00ab1f6f68491618ff232a4c29761fd375880612cb04b59"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "6f421cd81052929a7b0feb8dbed2d8798ecbd4c61d761623775cd507d5583125"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "a64df9ce52630d8e5718c8c9fa77375939ab720f87cefb95a3df48c1c0167528"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "285a0f9fe490067c77884bb6df96b0a124e3abdcad7e67374a41b9a87055b9b6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 
128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "791d857f9b21a03fbe07d9cda67cd4a0da1c13d5741298a4654bf4502fb53968"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "990cc384496258b74414f4f764889e2a25deb617ac29a05d7fcdd6b68a1ba1a4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "7b3d5fb1b3a2a8b1ab0579b04eeff5e8fab568501d7d1df5efd88671b60d2484"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "933e857cde33efed69a9314a231d430190bc4f57606618ee1014168cefaa2339"}, -{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "dc137055006f5f1470b8a4b7507657a69e49d45b42836a4d20da7043772c97f0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "5505f7e2c4fb5976d8aee209834ac0da0b63557ea7058c12347b09cd99efcb8e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "9a0bee33af09eecf91bac53de5d2ce183f8cd3793bb092486b2c9a7de9e26fe0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, 
"eeb623d07435e14a2a3f5f8ec15302ab4e6b9607b978dcb35eca7fc9445e5319"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "0c2060c6ce93ce430765fcd72bd122a0159ae20375333082fcac4f648885d2cb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "ebcb11ef821475f1094da11fabab221aa855e3d707fe174f53508a072eaef105"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "8a8b28f4600bfbe166f5dfb865e2b71d1732aaa45f98805587fe14d010f33bcd"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "799cbf370edeb637d8bf6ff4c388a11be6da38754a6594913189fe0f20a93350"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "8bfcbdd4b75ffccbea85f56890d65f9432fbf48e847d650ea668e4c9fada5473"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "8d4e381686a0e5091e03df186efa91be0a2734c1dd47429971209c9e47277233"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "c1ffd4fb0b1185a726d4997bdff74d3b39506944c0b3282952da16228a1533de"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "601e77ffade16ad3e07daa2f7d12ef9d044d5c345bb05c8aed3a5e78e4970af6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "24a8b291149dea4a40b9a60ff678a797ce6f37f5dedbdd9174c7faaf0484a87c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "275a81e459e23389ea3a79daf7a18911f0bf782059b03f1b49b2684bebfe6025"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "9e8fed406889b781922d7450717f726aa4c24b720f3dcd63eabd164ca6b2cd17"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 
DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "11fa072ced2213eab88f5e3d4e17623d7f1f5543cac0611ffb0d75b30850c589"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "a5f4571acfa39e734977d6e9635e4473d5eb50672240199d0be1602d96bba409"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "6aa7b6c29d9e44e73cb9ebded650a23674d9a8a2b0403a1a0656684db36f6f49"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "d4991c9d38bc91a92257ff9b4d4704f0156cd3a1495ba887b3154166c3a34d6e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "e12eee78a0e4395874fab0d31511dfbef73b155832cd02b5658e7acb1dd0a894"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "f059ad3ef1cd1aab9bb224b138555b42c20c32efb165eb2953d3fb360a301f0a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "66e98c27614610ea0ab6f6332e77d9f6d8f42a48611c99f0c3e5469e89e60699"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "a045d8dfc2b31778e7bc5e076ae820cacb866a17076b5a3676a4801d12758271"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "d6ffe437e8af1577a6f7f66ce5e15074b17934c78e79c0d8bb36dd896f2a4959"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "8afa8252a9932efb447add23ee98450a9122fb4edc5ff625dcd9afd25f19279f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "bda33c50c70a6ffa91eec5a342173ff27ebb82d6ea828b14e24206e5beb13013"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 
128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "04d24bb5e1458f09d1fe370a47f766ca738d863e2f61197c5055eb812fde32a8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "dff3d38e3a649a8e2fba7b9e445412f9de6c494341caf15454043022200a6bc0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "4eb087738f3c75e4f4d16935ddfdc920a7f059a14ca568e86f66e523648ea9d3"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 0, 0, false, false, false, 
false, false, false, "96357cdd75a24329909dc91507d7dd7055c6df74ff7dd10f2a942204fdadeef2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "cbde9ac42b47773bccf789a092c634a67b0d18e462ef158cd099a104b89b1427"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "28fa5dff384c8c613333669d9cebee1f60aef3ac9df5f0423aa3d5fe0e33efd7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "633ddd2218f3bc98e814a1194d0d36917dd5c5cb1913334c4ca5935e30404536"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "52c2db1b14dfa27669351185de199d42541bc5e58c2302be36085f29e338a01a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "ef798fa76cae8c51ecf13f475b6a1ff225da588c8326081074e006a9b9dac8bd"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "fe654a67b6dea1d0630b382a73ef770d1cbe7cdd89e4876668ec4dbb3b64365b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "a1c6c2245d503c449b9e34afea4f2dc8e00ce97c8a9af3f2235c1d792ccc9baf"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "bab071e2fa6f4e57bf53ea07b31f59bdd2841f3937399001bd5801c982101069"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "f30b300420a5b45d434611282c728e890518eef5284114dd4208ca0abee28623"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "6f71d3d3e58aa5221ffef0124ae144742c6e50e5a3a08a5de642cf1b1af309be"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, 
"814b7689ff58960b82e982cda43fab9cfd1392bc956d51abde47337db4fba884"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "5917fffafd9b2fc0bc3d311e41cd38525c9c425f574c5c7faca5598c4b82b00f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "94e1d4cd09ebc56772188bd42858f67dd03461e639e962cc02c258afa8cb2988"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "ed0cc593c6b1f8dcf72a8bdbd1193a9ccd23d4166b56da9359f9e8579cf8abeb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "bac61cd3aaaac0b6a1d03fc41a4c7066b82fe599126e24b3798440dee54fa113"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "1c574c7981f60b3cab2611badc18350213e27c486f3445793f7e09fa3507f36b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "ee342ff0127a831d9526d6689566e875104510b161125a2c4f17469ffcd4fa7b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "f39123f7ecccc55c68c9ce20808d393139ceae99a76ad42bd7345f4c3fe58981"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "4e678e6afd1eb47b47d0dbfe6c8ccbfb3fc39f946af7c0b6e5f50d0bebf667b1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "0aa10546a04b5bfded99951e76dcd856320cd43bfef74eccf15d837c436a8f22"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "e02c56ef41c4255b64f7953502b1092bc5777568cf05c635e19023eb4eb4868a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "c7fb92c04b8aba7b6b7610b0584f1a8f9af3cc8cb0e9c80c64ece9baa7ced241"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 
64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "c3927af45d1cd87171e7da565de818dd8b1ed29d203bfe4abad56c208decc86d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "00583a18c484b658b6af34a645f4fbd08951d6bb92f84c93bbcdcba0454f6d8b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "e1cd720642ab1e84d356769f5bcea6a443d8ef1b8388606496e5b25488ea3d67"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 
82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "d1fb938661a42b731fcbe40b7ccda0bde09930a2d739d8864ca077b4e38e4a79"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "6d22bf86256a1e88cf9dfc10cffed0dd341ccb8766f3d46c7d67c71da834e528"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "5be4e7566e57d03ae2aa35139d799cb1e75250854401a99f458e05725825339f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "54a62be46cbf5f5edd8a4e0a0ad974b4676f0fff40de4d8ad9c15af50d0c625f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "c81caae44e25100847136cb2a1b758c5475a12a82e5061be4c59eb6a2971cd2e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "5cd7a07bc5868d11c2758de3bddd52f6eb30566f66be5b557d0146f6ecbb99ba"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "8b954252f30f2f49a587c33a2953b0903abe5e84dc0e9e7d339cb61711e7681c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "22da67063e1f0167fd4bded850095dc0c830d6eeca6e7d31cdb25c9149751b6d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "7d13c14994cd6d7829662f9e198a135374676d5229754d0b5ef714c867a1a788"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "5e3800367f6fb8425d0cac1b37d47667348904e7c1d334244301f27c0056bbdd"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "64c046fe46126a7a81feb8ee815cf4f90065b3e9d6058fbab3a957d99f1175e4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "fe3ab3ae04e2e57ae01219284673d047076df2f1b8243fc10ba1a108754aa90a"}, -{ 
DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "b08c0484f1368367a733738e4d6a56d9b0c3c9b4fa73961326ce218bd9dfc4ea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 127296, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "9a6c4e02018f3ef4ee396d2007ae5bc8a7c39d49f12e9815770fc65e88dcf37d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "b13453cd2b5efa3848e8e3d6347487d33c6d364a8391baaf334b2165edfda08a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "b120d285b0d6d366c3cf00ec420a9c7262c051f783310b702c542cb2788e28b1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "d64d7330c23470798893d12425dd13512530eae0f4c358ecf72cf89c6ca93a82"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "b51269bf9861fbeb8f0f887163242fb00ba3dec4b02c91356564d15bac78cde7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "0e7c8ca2baa90cce2d5d0c8ad4e0d665220eae3d99a44dcc6bae4dc6dfe7b365"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 127296, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "90b9f02f8dc17f5b51f633861d4edba8518b7c1fdcc3c478d2db290653ad226a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "13d223f8bafe1dd03859898f2f12b10eeee27b4e5c8e1e952dfeb754f352203f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169232, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "460830f6828fc41e3c666db986b0d92b477515fecbe7dbef74b0e6d9d064fb77"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "bd3ad596c1d1c7cc23718b401fe28daf3be76a050426bd80c0a738b3134c5ee5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163744, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "40abe0954909415c58d5a129324fe4995b05ddb1d81f5affe78995031bbda604"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "e5463015c7697b21f92965c6753f0a1edc504e4375c3f2a117e888d3c9e09453"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "114447d1d8dc133c97f661864c8b1f024a7c524a61f98e36d922ca6083fd5823"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "c5dd79d2e2957a33541aa1da7594079e534bcf84890de4bf128e65c23a06e5e9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "67c3ca34401e8554ca74088209c659993e31f506d663b66fb0fb60622f4ed68f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "1d653b2e0fc2eead22a9baeb9369ba77dfcfd424e3f85cdf3cece215365413c8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 127296, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, 
"2df971cc00004a84f1e603d3165ff341d7383bbe9d28332bee0a5597f26f0703"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "d376ae7dae9a3c2e692f9189711b120be5ba7191ea50e612ca09c7ce0e2f1ca5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169232, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "f344516c276ed7ec432bae0d33403c3bd88f1891fe28d12879949dafd08debe0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "5b5fe04b54c88c2eeb90bfc8ff14a6fdf6620c46b5ecc102e2d387eff0cc01e1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163744, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "7221d6cd2db9fe05cb49db57b1fa936b0ad6ef9665043b80c6c5a7d58d9b3cfc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "8736b51ed4ac7b398d7f044ee7c7c5ab15496e4e9c58414542cfe15d3e3f7d5d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 224656, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "98d2f3641498ef3244447ca2881fefced8fed700ee61027ee4501588ff7e9b54"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "acca4020f46c996f40911713f0fa9ff98cebb92c491790bb214e32fe9fe0a908"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "b9564b8798eec622590563089fdbd4d82924e68d9d288da3dcdf4c1429c6dd19"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "232765a592abdd8aaae489a23ab4ceb5f3d8641601bf483bdf725955065c6268"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "fe7027937a226bad7b72e8e451fe517146c5991b3cafc12d23cad670a1b488d1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "7f61dc9cfc634d8e067687486503cc787a891c95b2b6e9f1aeb0b3695356f398"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 224656, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "978e55a02adbf7dc7184a0919c83c23067557021d32c83243f750e5d51b963db"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "2260e192c53aa11aa0248d5e4bc8629b1d2c08ae9184f852fd235ccc1e631096"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182544, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "da145d75774d836345f1a79d29b9e4f289cff35cccd76abc44022f7b30ee8e2e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "86c85d3a377a33ca7af40513f96194a81feb0e1fb497bdc9fd29621b6e0d2ef1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 175008, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "20e8a0e197605e01388fdf577d033fc20dd47955aa1df4f813b064dabe74d7d8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "4a689ae4c042f91ea96060ac5b51b93ba42126795c35bd92f6d109875326b997"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "32459c62ace67e003f7dfbe3aefbbf21f9095f6de9397ea145976db3343daea0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "f30e0e5a5574ad83152e736b88cfb59fcd675d2e6034fa83a7129a642650ff1f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "3d5a673d849df796e560c92ca07cded443eae47214596bcbcaad881698657a17"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "414d62f28f288a785b7742964caef136776769e1919d9193a156f4be29d27535"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 224656, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "93a66ec063295d536445fa9a17bd25a92b6318ff89aaa081d285308d9da15519"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "cb72556f67034b800da456a334e02d0e792daba71e732608f1263e5426aae9e1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182544, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "bbd3473cae6422a9fac95e58ae50b8f31ba175ac826c726e14e457f99acc80e9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, 
"c53620515965a301c4523c1a22ed842f2d4823d7ba0344b960408447a8152781"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 175008, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "895cb7e35b9a1ea1d6a030bea6236f28cdba17250fbd8a2926e79f4dd7291cb3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "097906653540c858b3a30b04f8ee26ecf3763e247e56328282db5a62027f9578"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 64832, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "37fd37c984511ab0e2ec5e2d7fc22d2835795d55f9e0137a4ab16211d4ab9830"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "793c23858d9d9fe5b658cf0d396b865bcd428709dbcdf28415a466438cdb52ac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "110bccd91ee2d55e8248e57e17821c6e1a269398a8649190ab62f7175c9e0618"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "7b14111e9fee4af595d71561a45a5e38ccd08ff971229087856d560e4a759d70"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "91c5425001642e4cfc9f43d8236502c58043a898714f0bf62de13c61b44a6086"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "2be88e9bd373292181a7c89d8a7a9843c4361544a972e224cb369bc560a005b2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 64832, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "48b35a5c8ddf81237c4bbe371d53b3f897cc89d879e4c194906d71b264b241de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "57c3356ab8cad9cda3ae8c8599e111be3fb9febec24304afaaa375022cf0743c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "d25c21daf52785ba16b5fdf3031e706c5ffa447da16c4d69bd0a6b5f190514c9"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "43b0da4aa1aa669121f692472930bcb5077e02369eb2205e46be1b5a646463d1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121248, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "4e38797605802fcd135f223eb5713b1390c326f8916c4c91f086d57026a01e8b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "ca3ad27df842ddfc38485e79f749adf844fc15226d75bf7404b9fd9024c83892"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 2, 2, 0, 3, 
true, false, false, false, false, false, "5691417671db0d3ded65d979ce1a29bf64591054d0409e06034ef67dc8b3d6ec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "a3f8f7bd224481b840714a7e9df3f8127c660b6c40499d46e9283a0de1279135"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "8508afd273278a306753e1a4e930faa3fe625ed5f6d178f126263a4c6a098951"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "00c44d7c904f9626f1c5459e8712d5c7c597811497164670f49dec85e314086f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 64832, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "44cc9742915a899adfa16d2591e648d2db65c4e6a3afa052bd977cff5bfee56e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "7dbf88bf74f74d6d76a92675533177dcec7fcabc17a1ee214ee50875488db70c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125216, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "bbce8499a7d9663d7a5038644b2e27cfaeef9f328d1f4cae298a9207005ab16b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 
124016, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "a3c9085a307e80e7e85853ae7531297c38b36e4a092077158489fceef16eaa91"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121248, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "df97da2fddca459cb9a764b5bbfdf2e771bc80ca75224b925d024fe4507c4783"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "3cc25e519da73cab35ce13d494156d9abc1be5cdfa593542c2cd114cfb64872e"}, #endif // EXCLUDE_SM_100 +#ifndef EXCLUDE_SM_103 +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 127296, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "1dc9a22931972d389637a7d093c5de242b2d9be11ee538e569d00b7465510908"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "e2c5125e544d830fa36ea3c5c6abcbde2d1bc14e17ea279b76f7f1a7f189731b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "4bd2c9ead737fb43cc77d9c903e4b21997202cd7544447e2ea70b3dd46cad528"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "ec30f8b9c1c37d1ff8340b28734e2cb1cf4dc1156aa174c29489f61cb4eb228c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, 
"79eb093aad0691a9792999b9c30b8a2736eef70fd9ec617616d6c656de2161c1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "9e32c3551ac7821d5e009d50b7c5f5b4c0a5fca2fce3d48be915f55368ab0dc5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 127296, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "57c86fd89ac6dae05a3d22f5072243ccc929fc848e55d06f604943a7f49778f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "026e9ff404c210a3d914d5336b11493e8ac259177a43d82b485cdb3046c313b0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169232, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "0fecf3fd73c5871d57d08f026994eb60d98c7586bc20649cf3850d4b00872562"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "712739ca65ee73ee767ceef743f5dd37de64baee212c6a97b8ead2e7f4d38bc1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163744, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "ac1b269266e48305b586dd778032da0101f9f680c2522d17883511d7340c1e8f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "95aa0c0572c8ea82f04c34dc36c6391c222ad7aa51e0b7bf10a5a7897dc27f3f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "bd0a894f04bff4067abec74c6d78e41066035a580874cc2998ac93436eb5679e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "79b220aec5f408e7c3faadd2ebb9b541f96cd37ef885d497781efe5ceca39dcc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "0262bdce9ab7a59952a4a929d0d8910c053ac9f9fd421b530cbc289391d4c591"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "39bf6f601f3f945acc8504db2e73a76c8d2f89ef63815408eecea34f4a0a5038"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 127296, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "1404f058ec2378eaeac14e0efd1c5c6d747e874c9086d86a5c16d4bad3283af9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "5fdbf73865b5780d969215e3aa3a93ee89f44a6116b9ca493b5c5966b919bc90"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169232, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "9de471af7346f2bd0838e33f732f57051424d69e89d2c2239fc32497aa9affab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "0cb8b0e458554cdb7418aa5279e41108c38a8adc932cf3a120126b66b36cea12"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163744, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "dae359822070cf4d043f0d732ff03067a8d1fe3b286f3faaff1e3c271429210d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "1a56ad5b985abda4628c9b695fe0d7024e5c461fcd0aaf64daf3dd944668909b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 224656, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "bfc60a1b40fbf227a5a7689e3453b70d076100373ea1a9aaec4a17b759c4edeb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "b21070eaa5b174cf43eb600bbee036b2f3479149171be6fbaf2ff3e8d416df0d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "a91acfb7ea08adfdf2fc66836081da7471e3bf69352c9e8b4676d2e11d8c7d40"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "1e8b3d40850cb9e6a6f379fc6319f8d0545ec9ec9a5466d2b7f235e0549c9a4b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "6805432b4337530021dc9f880e5c25c7473b600b2f0e86dceeff0305e848f207"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "c1575d1e5176315b5352d1f11199e4c8aef1f6fe29db936e3d67a0ad0564ef87"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 224656, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "bd61b6515a98b35528025d2be6a13aff12236c757263f58b1c2ef579ea375912"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, 
"dd215a183711d8028836b0b6113c817016ed3ef527eadeb28eb286924a167fa3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182544, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "2b6c3fbf43ce1f66340511fc0f3a20e641cd90cd6d5d4d301e635370509710db"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "e2e612099b39932d114af88b3f5e23aa64ae44337a41a7009666306440495268"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 175008, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "9f36d0f1334ffff32d569d0f38df5c12a2a8af2a30d03dd82b5d9d2627ac6425"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 
173808, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "88a92815f7381d9737059e2dd030262cf093fe0c03ad45039550c5e5c9abb1b6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "b86ec1da633c2b028037730b7e1976cec1784fd42aa165cb5cd2eb73f557df5f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "1fbf6ae2289a3306c5d8ec1f7bbadab16c57e56dd742aeefa83fe4d0dbb4a471"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "899ae255331642b3764a88301430cfddc01fe7cbf5e304370ca93c26a1348a3b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 
8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "332700369d58b045d9c6854fbe74bcd4f30d53e3038a07a8f4bbce9217735051"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 224656, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "ce9b8df1bdc8c8cb7aba4a52fb8d0bdf81a70f6957ae6de38af8e004d1bf9a38"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "96d512e83e9e2d36455a79913a954b1f78a7beb7871905608ec81e1076387912"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182544, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "eeea581aea2b494b64b47eb0f476e60e88e522390d8d627e9b8fa6365172207b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "b5ecab4cd8a52a85e25e6f9ce074d980eff8b309217baf143f6337d793fba01a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 175008, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "603004d1ae3b368a46777889bcf1663fa9f6c4a70113a229971bc24d883dcba3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "73eadbc10aa6dc84147fad535c6ea25dc6dec1748c4938db7ee9c620bb88e113"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 
128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 64832, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "0c88efeb0146c36e67afb7bdfee135783000a8d0eb9a07051d6c3a4874638030"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "0c701e8a7dedb79a63af0b7b6102db80031af5ac3a0859a7b2e0d7d6ba412aed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "8caea1680b03b0e761b9dc19f4e1f8871142686e9b219ce501afadcbe3ff6425"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, 
"c708f8901ac8b8194f8070b0c6d197e12ee8a70a06ce04f4c93bcd676df837a3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "bee6d7d4dbd79c0ca6743d3da178dbdc005d035136ea5cc07eda775f5efaf293"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "19d7637ce2088e130b52965fae32ee5e7eb06a01ebc00d6e531fa65107564dc3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 64832, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "0f53aa3347683c02ebea5b9ab96c2fd8561d85b76332078c524684c8c031c793"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "b02a278aee90048340df103eabf876c3001eede792d71e9f56b823c03e7c1a30"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "62d6cc6f8e3fe2fdf5810fce71181e150a35112be7669a6be7ea5825a887e26c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "cfacc3b8ffcd3332c267d3b179cb1c4c028e0f18e4e0617d9040b71d06f74f70"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121248, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "eee9cef4b815ddf4de0f17efc97266989f443e26f3f8f03677d37782658d4461"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "a39081f75692f136e569ca354654a64ddc3e8952284844b98589775d18027bb5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "88c285a6de7accbc24175d9b4fabe1af42f8f08365e7c08d18a5ee8bf545c2f1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "553385f584d781a70ba3456f659b7ca2c17317e4ccfd581afa3d60b2f63e0b2a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 0, 
1, true, false, false, false, false, false, "1096fdf3548cda0c8d7c34fd5b5ce5e11333a40b8f742ce4abb3ee2cab020c3d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "12050b4471066b486d94302716d641d43a2baf1069c2e5e6059a52325323d3fb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 64832, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "1909c4cf2344c550e42479223e49b10294df6b31bd679c0cd4e6544cd5003bd0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "6b730bed4565ec23274e88f47717c7e19a7e9b743bed7bf37a22cabcdd4d1b69"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125216, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "746ef13d6facf3754a4265efe4c545d02ec7eeb021ef2e8cd03db0d34a00cf23"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "0088643f39ca3505e15e207c4d477e320ee22a42167a60a0819a2a10a3a99a4c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121248, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "09c1bb1c6a3a22f8f93dd48553967d679146c5413108e2c850f9d425f56222e2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "6ec74d3aba466b262e7ee34ffafda9409d741c2d9e5d0cc052304413f0d9b907"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "7937199c3dc895b4a38ffeb51061283fa19d6410f7643fc724098eeef858f337"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "90c4ae6009192a95e6072d26d6fede6d66f2a76aff2ac13546067458e98a7c9b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "310800d10325188bc37f2dff6f1237aeb9af387463c95191b5e6c390b54b73a9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "77ba562322217438660e49c4874f34b049cb3d7895474cee8d372b6f8b43ee10"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "e7670e32fc4c7ab40f121227c326448ed56ad65096bab9e2f38fb61b448b9147"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "aa1a39b3f13d5acfc64d4f610eb851bc1c69fbfae2b11a24ec020c388fa56a5c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, 
"1098ba0832132aeae43a06c169a01eec4fecf87c42fc2faa35ebc109e1d77302"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "4d0ba16b452cb91a20e95bfcbc1cc6f76923d4dc1a3daa67f44dc456ec0ad353"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "d92a5b622747dedb668178030d13e207ebc300d6ad9bcbaa4f7d2f544dc8e423"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "3ef047cc093b5d922ccd87923ebc29bb4f19b3032093d525ab8013b8e9996e02"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "0f6f3e3373c298c9feaef5dc4ba4a0deef9f729cc9fec0ae3275913e37e1d2f2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "5d456471978107393e7c7ecae44262e63c5d90cdecbb513da88d034cd52d0054"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "13774b5fd3c50c3b07bf5932901f2973eb97d5b0d3858600b6742e591de7a9ad"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, 
"3bb73108858d32650c3ce5a58ed2b74a6486cee0c863b4a9b7951ee03f4b934d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "f8ad70d2de9f078cb6964f8f5d45afb0e7dc333f8e9b58187890ec252daee3d4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "89b404aec645265911629b60d2e6759c832365da9ab8c13cf404a010a122bfca"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "fc9f2c7e1bb7ef276664ea010b5d8767fa4a71903c16a965d676d3adbf432433"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "580fb6178c68b843bc056f08d8e9e49925d687708a8cda9f166b653290b38aa9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "8fd8c6ce9e4a856e1d74dd2b97696b71439cb7755e35668723edfad32b708e3a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "b81738b568dde4dc7a7daa2c1f0c6d2b6d8ac033379b7806c9dd5c525943107a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "d4336826f96762b336e67e84670aa4177fd824bdd385ce9a1085363cd10c8424"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 
128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "a16a80e4dbf40d343438bc5e3f188fb1661315b935d57fdc8e7c85ce6127f23b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "e8e35f0845a8ef03d6e569f8158ddce025cf0bba92f08975ed917e18958ea0b9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "b4c751f81222fd064ac8fac347741aef06029aeb03f6d8d95b87830f2e96a3ff"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "fb3ed79e7ca7690800525b0f33c748ca131f8c9b8cc88cae7d544dd07edef993"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "a52bf14a17a9a7cd72a0e96a76ee45b5d4e511bfd7f6b910041dd29eaca23fcd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "7833fb50701cb08d2063445a5b2c6697b3ca9de6d9adb195bca99bbab03dcbf6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "36090a64a48f7158b78a1cc4ceddf4f92f347dedad75e4b8a80db1d22b325232"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "51705e1aa3b42e55377239392c78ca783418913c6ab932e222cdcada4c6233d6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "08b461e00ed3f72beb93c2cc5533abfa34744b4050ea404437d320648efc04ce"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "a1d5262025015120516a341593f8aab84b1c75f6bc401679a97566e54e7b8d15"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, 
"cc9a7547bef175e2efc0f306416e50de3a3a862a5f9aa9fbc33a8d77fbe05456"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "d42558e334b068ad40893e96855356327cea0e4293977a18c2cf2e3b395bec5f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "8735a0980c7c2dd523d1c06655aee676685afe346d7a105868acb739a78e2fcf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "d2907692668f78ef1a015ff20569c44b85142100d333930b1fdcad69d552a12b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "d42c696f2c8580d1c6b6a76fabb251d3382dcae01438d286f8ad02bacb112c9e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "1285793c48f90158b5163112a4d9c8e3ea5ba42c4db9c1aa7e4c0963b0a4583b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "237c360aaa1e1379205ae52d912fced40cbb833cfbe0dbeea8f8e748b9728ae0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 1, 0, 1, 
0, false, false, false, false, false, true, "5da6fe8276d46dc89ac6fca51c5b0d5775eb475f35ed65ffd6356f6d752701e1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "50a34a958838f9096800733c20d0a81698efac253f651c5a93dfff36f975d44b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "38030175b1b3c506565bda03e88b5a3550a294f922411c30ac4f55b371cfa5ca"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "eb4b0a84fe7065c081c47565bbe3a5844d0cac255f88ef246878c09fd2cd5673"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "e352adbcee47d11f9144032fdcf4694a2d314ed18eab95998eee216eabe5fbdc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "23a829be6cd516ccaac0097839ecc7c18d06c36b6565be32d33b4dcfa05cc252"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "f4629edd9abe9849d764e50f6e695343d1f6b4813ef3c9535bca555ca36ef42f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, 
"7a0044e774e263d837516b8ba63e7b3345a854294c383f47ee6f6dadbe2dd49c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "068e2611907cb3c2a5b3d70c620471b3317ccabb7bea4000b722777826c2ce8c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "fe1934c090931e97713b5e2d04e43dd8e890e0f872954d6d2709a8aa8ac5f1e2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "0ec1b122910d3dda3e203c2255306ce632cc6abcc172e36c7b93cab3f30df549"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "249be7cba32d4a0cfdd44e6a0b1ef5ee7cc0fd24dea343a78a11f406901bbefc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "02a2778d8254f49225e7ca8d23e7afc9deaee73cbfd117925f830ef497f7b8fa"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "a91898b9addf588418df2a35e56fd95c80d95b032ed066ca523a133d4b14513e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "5be60dd2495cc56f60e842e6896a985a9a6e95c3fbf55ada692ac735597cb2f5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 
128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "37dd1685de2f8625554a58830d201b6b9b4d79dafa2fc6efa3cd80d6f936a284"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "16cff6f179e1a1740e0cc5582e820071e1d25bfff5c8d8d1dd1a06fac371f353"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "78001476bed92930af991562b0ebd27a5371e3eb9723be7a6d38efb84e46c504"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, 
false, false, false, false, "4fe6999b233f911b71df85cc93b7733734941bc5984779a98f097c78b1061584"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "5b8f67c36acb7f27114bc78730bf952bd33c536ecba3d4571aa25861252dba57"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "a2e86f21e3b23b6ff70d5f302ff60a0b6257d3e10eda970a70b90cc1b224208e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "b83c8f5729b365dfd4a448e78e64f6f4eb5498dab3d18eefbcb1a31fba392c51"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "88b671ef75f55db69948d9ea5a5bb4813531d77dda4ba09fd6fbe57cc1ef4798"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "e0392b5f951becfb2a85d7055ccbc589a7f7e37c58ac2612313f86720d0e8676"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "d35c180fa712c0a1f7dca53fa103601011530afaad6aae7728d0883217ffabf1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, 
"3c15e3fbaa4df1774267b438ba2e19cf683b19b202a4d05550e03798cc580090"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "0227834438ca37ae3784c447692820cbf62d7d5e2abd5d13b0a4fae64f3a3e15"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "bd64408ea6102d78e903b88d231b077e9a5c9fea8066f0027d8b74ec2df7316e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "4383016800c762b3c39a30dd7bbea76b9c7c61e48606f1be43c8a9e7476871cf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "19f6e1c008d633bb625b05c1ecaa2af04625bd2117befa183919d0c0568ba93a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "d103f52395c6923bec6af3a02fcb4514af475a5ee7c1d07f831f8033e9d763ae"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "bb95c00b068499f7faa3d09dc13da839d722cbaa30f55602f7b7bd10a1a35be2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "7dedfc875836c5715d51d002f535323c40c1fcfabf1fc5f89dd9a8effa8c6050"}, +{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "e5a654bcec4edd18555f70d1746e312524bd67f356a241a33844f9f08304395e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 197056, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "037a3f53aae6d15c24cc815035f0ea9b31b13f5037d3538ace3792983dceddcf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "5e76a2de0cbf6ef274b6179a9d84a18a56d0ab3958e44256efe321b50321f441"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197072, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "48eba7a5830fc7ac6e550af39baafc9e7331d9b392a8cfad9ee84d0cfa759617"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196896, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "703cfff5eda207942c2e02d6f25a8b37738004800bfb6f5eeeb5b7e7e5ac9d0f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 197056, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "724ee7c2bfcb9c42d0247bd8e1258b71d9924b62f563dd9f7bc3121447819880"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "226793521ed3a88fb1c841458162384d95e0c81cfc2eef7b8317e0b9cde8b627"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197072, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "a167f01f3b4dcf67d982e9f32fb233299a2cfc1771603cf638e7bbece11727fc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196896, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "be5f1e5521c8b90d2d901b15ecf1c06a84d5116e3015d8f71b48d8dd8a09758f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197920, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "216a684486abe7a66a4be8f0614b5b54683fa93b1d5bab52410b6282e7cc9870"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 
197744, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "da6de73e055a3523b1c84b2d17e6d6c43c28ccc314be3abc3c8e1ace1576befd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197936, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "1814db0a0c97327106be12788f6158e7f2cc0113e4ed005e6baa89faf4d4986f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197760, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "9d0cd7d6be5ae9e256871763da6f931314c28d54427fb5a304cd510223d03c53"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197920, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "1cafae28e2de74b58a377064249e5efcdad81a6aadffb230e366899d5e5acec5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "88ecd0b6fa5cf7dd8ede41b04af63934063f49864a7da642c3300aabae0ef5c2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197936, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "91c98a58f2915d7b3695664d1d03d83d54cb47e2b32c8e01118f6cd87416a415"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197760, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "8cff44dc0d5d54159ce9ab5cc0ed2afc8a01e19f109fca7d1150b80a00f58886"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 197056, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, false, "afa4d16c625fa0c056fb3820bbddf93b48f951ae601db6c0db4b5b9d832f9e9e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, false, "3b10632cecb1315bf9c6934563980926e2bada3d976e76f8c79737eb90073265"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197072, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, true, "88acc8dbdb61dc4e0401d017c80ab3948dfa426852a2dc8d7225a96a717d905a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196896, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, true, "c596459f8ca136e9e6975895aaffc57b90cd62557fcab32b623ed6c155bba38e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 
128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 197056, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, false, "d8074034ceafde8a4ff77d51e9e106512de4daa6036ebb87c9f5f932c84fe7ac"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 0, 0, 0, 0, false, false, false, false, false, false, "e33b58f4e05fce14c991125f376791943b423cc1321a5f31cf54cbf67342843f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197072, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, true, "3f2a7db4a23436f7e95cf205fb576340d109dfc746d5aa771c0bc67156c97a04"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196896, 512, 0, 0, 0, 0, 0, 0, false, false, false, false, false, true, "a920c04adac7f4665ff09bd97690199a95aad84fad21702bac29e9c89af3af0f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "6df76b4aa4e872eec6e63a430cafe0f01a189f6d68fb1e253b04b4b10dd8cc48"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "2bb354d38e695052a05cb7dc6ce59f4feb6c355fd39e775826d53dae647ff99b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "f7b1659c563e39282f64f3e8f7825a756b24d019715fdfb2eec139925325cedf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "95076d1c8ff8985851870b025722215ed23745e1006e49f9d5a91c309836fc70"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "48427f34c13819e7b1afd45912b068d6cc720cfa3149325a0cbed15202a8e205"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "377ad5160e27ee2fdfcd7310993969be96de876551b84c8a74a32f6cc5cfd6b4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "54c4035c15f77ad0f66f2931087c5cd52793c2a5460d305e17795f0590bc4d8e"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "7975156a51d1441062f383b01f05408c66609e78f376493cdc468fcb3b397253"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "78bf02dcaeb1291aefb0e8f9a769ae31a88652c17fd4eeb3517fab0f1031fd9c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "610653643b8e60b0b21fe676f479b49eaf819fd6b6609039365578682e2db8bb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "86e9abaa03c4fe92abc07b7aaded04baa3b9f8de102bcb98c03d4af7da24a4a9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "3fa16c18185a8d295549dd695403d63c4c6e3dea1dacc6e5b7f2a9751b7051bd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "d6885956434360f8ed72f9e8c18b10e9ac51a50b4ed0f82aa0c3e85f3b9d2500"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "17dd3b0f5e703c2f6128b7ce1aadfd8752edbfc2df7903ceaca081f47608d79b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "7edba994781ac28b4aaf19f53c05376beb7bca2dff365bbdfc0916dad0f282bb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "9c210a7048ef89cd5ef657ede626aacf4cba524b35b373c41bcfcacfec786b92"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "64468a6fdef14ab499f0dcfb17eb12595cf717d11ce63094e8c97374c7f25843"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, 
"a581477afa4a8093ec8aa768e3e0e27d405b056480aab1bf79e50605f597aae3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "ad385adadb03c3fac9122883ca5066cdb1878d96408c5451cf4c741a8745679d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "1e9580d488919c1e5c5a91edb91d30d7fbadcfae7a028bd1850451d89b47cf14"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "cd8ec2377e521064546c3eca41177da0cbc78f702e303ca376d676aa901a91ad"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "29cbb3bbbb86091b6ab26e00474bc77548004cfe281519292afde773178b21a4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "9bfe7ec1cd20d16b4fca9a0369635216acbba52a686fc90809ef6ffe21f6d3cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "f20586fd35b0944ad85fe90cce19b4ba466761556257b38d4413453523ec2ce7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, 
"2d6a922c81e20cb86ba0acf858bd51077a07603f6cfa723b5824dd57be352846"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "29fe34818a40dd4f65f0dd3004e386f56e3924e9bea27bd934741fead3f10a69"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "e6c14434520a975673a23f04dec1aaa2dca3b7b257fe32251bff09d3136e2758"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "ed08b3cb219205b8ff0b4962929d98917742effab689551824dcb698c888a10e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "af6a1b5c6b93d6b4e24bc28889806b57b53e885ec7af2a0b4f24923eb5a2f8a9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "3f060ec21e49651d4d46b42f33171f482ecf3c6d4be39786e50aa89e27e98d0f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "0586a29f773ce83fe8d61c9bb2d36d2a8d0c6627bc701eda05dc26c0e1435f40"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "f25bd505a571d9ccdcbc98918c26a26d843c5daeb6bf41f1457e18cd0ee0c75c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "b8de80a36ace599cdcb1ccd472419fca909d38fe17174b2cca7ffe7f500fa02f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "82745a98a2413fd2462c22bf8f8b66d494b5982a6e40e35eeb0ba2b59ddc64c7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "8c0159b06df04178d84c1b4ee5b0beb87877ad9a8797eccb0cffd4233c5acdf4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "655b89445e85e79163488ad1a93cd2aaf101607f16de29f2e397f79e75d6be2a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "4f224b1cd513fe14bfa050aff23579d4f685358a5b99b26429165d4a0b112b38"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "a6dc2b4720458ddd22baf73973854c9c6698d21ca3b6a354e12a479d822c9fd4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "1a01c9ff1332ce9954c0ce474eb03666234c19a3eeb8cbeeae77349181f90635"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "59459da6eebdae859a56637c0d53ea8b99fe6b433e3bcba5caa902c47ab7e8fa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "cb7a62acd9d57bc1f5dbe39c4f258ce6e2af58f02988df839bc135772095faf1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "9ab6c98b2754772685bc8920ec8f43988d393b279732924983336fb3fbd7ca8a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, 
"605bbacdbdd570ae298b36a2bbff9e7dd11ac38c66d70e882908880c00f72aff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "047b693d5585f4baa5b63953633155cce853a8e3be563c9442f1bb09c06ba147"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "d56868c0700919df9cccb57c7f126e04a6b1ac219e06fb9f09ce357139943192"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "cf11250b0d65a003256d41ee99e7c717096273bc1648fe91fa32cbc765eb9b2c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "6f792a9d4bd422ddb16ea6cb69f69af5734befe3c440dcd720348b69400efdcc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "d274e2d42c1aa2779382edb8ded6e8ccea4482d60bfec6d78a0d1c0141bde396"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "82e9eb118eec379a956a7c7888b5a8af6a3e31e325f6bacdb81111ae07d40e98"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "2bace31ecebb9aea317b1e378d42d5aafcd86ecd11b7449ae59d1f958c2b3c6c"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "2363e7073604517b47cef70c7e99acc725534f6509fcfb4e08b754bb2831ea60"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "70583490efded0be584d70a7311583776fd9efad490e0d8775a62deacb8402ac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "ff345b4d5410fedc77a9ae08490636456a45d673d290234952fbea48cd4b62bf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, 
"33bb1a735437e117ae5a2855f3547bc0e306546281f8a884fe4ec2d72bf1e7c0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "40e5b3a782baa666a0d5025fbf17b5a9202cb2c7cac52c7dd4d1038e31f01953"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "0abc3f70db98be896dcd0c494c1ae87b4a12ea870def722189dc991e4a8cd357"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "1dc033ea6d1bffef99c1d50ed67c960c535c1eda2eec8c09f91a3ecd56a4e815"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "f2edeb38066c0b3d755b71674fd21d5bc479867274b6bb9a59f34dbb67fa9f6c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "3ecaa7035c19b54c431932e24abe6d6c94fa1bb0abadb0abc2591a525300e13b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "47c407a45f935707f6ca5ca91f90bb77f6c931a596b3a631d18500bcfba538ec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "c66c4a62a1f9a1b96cf8be6c75f668a5d4cc478c01ed87bafdbc41987086aaec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, 
kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "61810a0d28f489d235f6da37f39cc4f28bdad8f44fd763fb872c4e089d3cd018"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "043419b40265da68ce30f45f347e6168071d5162d943a2f06f246a69a40fb81b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "2f9fa824c350bcc33e5fd87babf65f6af106ce88ef93c37860599e95fafdb2e3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "560a6171bbe88f6dc135b760be44b6e9b28d1801972940a0297c5d32e73112b2"}, 
+{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "f8d1469f554635e03e58516ebff3a229d6c0c80b66366fa5e26b5ff37dd2a317"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "d0978a1c86be0a00e3fbae576b73a6344e90e172d2bd5025e52d7c2d1b49bb03"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "e0ff1fa0deaf060e062e447b492b1eba4a2e0e12d729baf794ce99d387aa7066"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 
32, 2, 0, 1, 0, false, false, false, false, false, false, "e9d9c38b91001054631b9a5e1c684e061801ead3d20130f7d925771f745591a7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "03923e987ddb59df711f368c37ff6298a40f6d18e5cf678a0f1ab82862592657"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "db9963b924d7ab874829d19dde742aceb15fceb3746c0f6cdfcff403cd2a77e2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "9be766e46497b235537baa810711d4e6e1dcc1b881242709bdbdf03c71a17be2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 115104, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "8bbbc22534fe8fa99180cadbd098ebeeaa704ef2cbb4b43a5c308eb0203c83fb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "08c9dcba844fb881446997fa552c0644aa7056d63d01546be141a774505f3240"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115120, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "f193c805d43eec0517bc16c37dbacfcffff6db6ce2409287cd0ae9a4b4ca7fd9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 114944, 512, 1, 0, 1, 0, 0, 0, false, false, false, 
false, false, true, "f141b71036d9c8b75fba560c06685b104a99d1d0eae737f1df8001109fde4f9e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 115104, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "504c88895237837b3cf6b2a7f20571bb91f5436bcde3eeb27a0c53b1ab722043"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "ad4deb7c54a181d4e027c7ba05710dcaeea1514119d54aded22d007f43805710"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115120, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "0c2f6bf207042e854b9c6e99c1e4caef399ddd1aab440214f36ee61849242422"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 114944, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "01c71bbccfb8614d956fd608d37078456f6a7b1fb718868ee98c15767666d7ef"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 115968, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "db218e2011868cba1b84bafa9becd7c3c705f5ca4f4009f08bea0967b8d03bc1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "0f10de91611bbf9a06c027c49ae5cc859bfe138896b700c4fe407aea364142f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115984, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "9467c94e1bcc49e5c4d03f4b0c0b030cb1247dd02c430e64e26889c00e6e4eb2"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 115808, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "0278d5e9f32ce172120c5761b19af0268776ab0d6d3230f54c45cbe92858e0bb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 115968, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "e7fc0c38b6ab775c0b60ceed78ebce946c72f1292369f1f18e8b490b431e0bee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "08eef5c2e6c30d6cdeecf5a8f89d8eaada62de668f3104c0e9a0a69611e8786e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115984, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "bfdd7b7f56a3ebc424cb98edda4000de82d3e3063948e73ad2a7be01ac248de2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 115808, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "6c52773b6270856e9ab1855d3120033f8f4dbe193640ad28b9a57deadc1d5ae0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 115104, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, false, "88b9ec259f7c225e27c823bd08c3f9c0d280b2432ea4635f2fd4b085c19e7203"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, false, "36ee9bb4b4cf8895a2d1feb13e0190bc93c5fac6a75c1e9f918e01d85ee0fbe6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115120, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, true, "d07f8b12d0e33f0c89b40d69321450fde75c631bd75c133229fb9ea0676a7956"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 114944, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, true, "b9e0476e074427eea3f27824be335d3c7ebf5e8a3aafeea85d49e3f68f92a927"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 115104, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, false, "a41cc363e053a04f03fb080ef81373b31824b4bd5777c08a17366a609820c347"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 0, 0, 0, 0, 
false, false, false, false, false, false, "f811dc8629680f8f1fda550b9cce3c5f6a1b426fb96a56df8a2b9423cf6b78ce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115120, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, true, "a18fcc50777772b7955264bdb685f45328aebaeb707eac89180c138e15c2ad9f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 114944, 512, 0, 0, 0, 0, 0, 0, false, false, false, false, false, true, "57bae548a94d46a992079e172a29830bbb1552a7c30f8e598a6612cc516f4b5c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "9edfa67918fe824ab6dade9f4c92eb8929fee0b6e973da129d19aadb27627536"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "3419cc48403970873258a2938521a3089694a9674fbdfdef3901be7e4b109f3d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "7505d4ea34844a1993fb111c71d7cfe1053875553c2bb10ec9411a57bc4b584d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "ed8dcf1808fe8762a03fbc7e28b739ea28cc1a4dc62f89dfd0276aca40beb9e6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "7b0284d8ec55f91941c71a13149d5be68f29c0897df8461c13bbddd1f0ce14d8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "b1c7958ee122f5d506ad19902deb2886399ef20ca1009164253dd9d5ec634d40"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "5ecc0b3c4a7f1ef2f2b27f2a5a3fff6e9bc4f974a363f0de0c89486a275f43af"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "723236774e272dc4842909ef7968b42c9725606bf4e53aef803c655041eb69be"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "962400d47ba366a1993767abefa4690d52e7fc535fd679ef4cca14d7678b764a"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "dacbcdb09c51ff7c527381d38523d3c02aeaecefdd5fa51d79ca3020abafb557"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "58a5a73bff14077cc7ff47817ffdadaf9db26379bc0db8e8dc60cc1d2ba432fc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "d0858d37e14a0bfc1feccf7da15e05ae8702c938696f572268c9c755b1bb441d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "fda695261d40395882a3e3a02499da90d5a0d9633f52a4d938a7d3406c1da384"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "2225b3a420fb6d2b581e3e2159481e00f7b99e5f80d1910e80b73e4694c05cc5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "65366de99e2b64233f292885ed8113ca5a08d5cc360df05acfbc6c7178105175"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "ee172788dfc173ab997f2d30ee7c139b2cd02990908147a9cede09b695154b25"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "ad9cfb205c183934b949fc27685d7e049438f6808daf700aa2a7fa79874b3d3c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "1ac91b3701190f156e192fbd3578a7b6ee70bd43ce8a49692d21e7f2a3b9f1a9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "c12575ddee8a90e80fb1bef80c0a5a9316234fe31cf8127140affacb2ebdc273"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "08146846620c90f84321878736509f549ab5b4c58db341d3c6bdad70940133c3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "18d016e2cb236ccaee49f5c11546d4b08bd0e3d01aa54e93c34c6602aba603a2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "31a356db651704552031a264accab649b431e6314125f5d9c053f903b907d898"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "90537de1fab7fef28c7015e9397372e8fbc846f3afd56b2d80e8c00e7536ed74"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "55ae94ad7d30536ea00506015c1f68281f6b8e8f133a2d5cf7ba55c93ec98bf2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "e05a8fa5371cbd26549c3df2860e8af71741cbab23b6167ce4bfe0fc887ab645"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "2e078114e509bfc20eb5e31d4a1648d85e4bb6d457257af44151d4a2d3634815"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "526ecca5cc27935f5f935de3a6b3c6f7177f81c6ac36e6489e9a1f1e197c8f63"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "e751d4068d92f62f8ff7d8a61124aba92652acd754854afe9a61df840edb7979"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "92fcf599e2af7104258efd8d23224cd9f62e8013e15e7b8929716f504e474f7f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "89546e80bfb32d58a8b72453db5dc1fb72ac274e5c02f74d4e4c0765a7386428"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "ff96c43ea42adf745b516850b73ed3bf1873d7aca68cbed323b90b1975145682"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "2400a8a591167daf3b0a96eb8ed301c0ea143e7040dc3bae64b7708d4e02ca0f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "4d9fcd44a658b3d2efe114d880f88d842b0ee2dd2f98329ba08378a6227e837a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "4852d1743314df037f5276d08ccf99cd67110444f403bfd891db3b2b92fa62da"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 2, 0, 1, 0, false, false, 
false, false, false, true, "9cebb034f30db70404cfebe3e9f6a43cf164d24d6f2ff89f1f12a69125158c10"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "b03914f0005c2bd3249585943115dd863add9fda7bce28a56a35ccf2f5a7ea14"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "4e2a9685b094814be5e746621bcd2e9985199891c171930e09fef26100983057"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "064509f75eed72d70d4b3c84f43dfe8594fa35088802ff13a667d9173abc6210"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "34a670a1bf40ee7cdc27893b0f085eea4e0b08b945e3096d505fe137b7f3df69"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "4589cd30cdc67f4175316f6a81a6af9dc3a48a31354ef248546f8f8e709c5f5d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "e73fa058b722aab3cc0f91634ecf6c7048126b41fc7dc06d63b73e4ab7ff50a9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "a2a34f53b61a13bfebb8141c56572805be437e7fd8ec68687ea85b935d8140f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "348de19c4ddfb27f2bdb347d7ef089b1b79ccb3d9fa5d66c9dd38b4f3d22f324"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "1f03c42003b399757cf0ee9445adb6e3ace190f865fc4988ff0470f2565bf324"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "00e46a72e2551d6cd1dd8d4e0ddfcba977d1a61fc9d7e2aa05fea825e0cd35ec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "3c1d64cb0841add73e08faed1e41bb5c719e2eb3cd8348b5eaacac869f09f612"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "e9f7a1f59b802cff24524509ac179dc8791c0dc4307b70389c17a0df3bc5eb4d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "1d655f83d2eeda21a9fb9dfe8d0c120114574a510514f81386b6fe6fbe7ff33a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "626b98fb4a4718e4791a36529ab86c7ed15d432f5baf79219434c6c6fc8c0fcf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 0, 0, 
false, false, false, false, false, false, "7e55ead2b9c911759af6e68643f94dbbcabe19152a632d70d9570c0311ada960"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "4a8f4399bf17691752f18e057511881d851ee57b03af95e7d646ea812c6eeec7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "5cbf9b6a5fb110973407c88c71ac8c77d37fe7829e430b47d0da534271f75186"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "a5d73ec8ab7837405aaf7fe9f80c38f0b594f04df1f13780eebfe1bdc5dd814a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 
1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "40c49cda9642697325e2502964e546c1713479c8f7dc4feb05812e050965be16"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "2fa1f0d3389af95a9e26f82ee9973273931ff5c7caa52eda48641c3a061aaace"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "85d43ebf5b0602008810448105d04cb952199cdce9e02cfc7e0d742cc3b741a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "ba71f3e7076b307743cb9e3bf0fbda1f9b7dd8f7e0e10e1f51df02497c02b65e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "9a1b8c1bacec6d3000e35843d07d90e9ba4e75fefa45184da54a8264c10dd2de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "700aa3ed553f67e11df8aaa9fbb9883608c275f307f135a3f3854c0569b465c5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "9a770c14ee974e9f887f0d214df043e01fb198858fd4ffaca5cecc0b97d9057d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "c6907945b77c46a7a655ac1a3ab4864fb165ecc8290e53b1dd1b1dab8cdfead2"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "03315f56bcdccacae21e6c390e77b536b8f206b4f1f362448dbcfcf057dd65a9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "dc924a08b96ac20e260b0f24eaa4fb63095f40d528e00b437aed6dd98b4042a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "4ed1959d0325a5c34eb9cb0ee62ea75b4aa7436dc9da8fe875e7feccd6ee7524"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, 
"658930ead7a0baea0ec88a4dcd4480650c789a3fe8138254cc7d1bd219f0fffd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "47f6833888fd44f592fdcb4b9f8073049ee3e231bcd16202b62926b19c9b92d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "08792b79fbf557611aba566885cc9f5963b21834e0042d4a00bad2b49883ce01"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "da2718d8233a10b2855ce3df726681598de298f51cccf5caded72d69392e73f1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "94763fc4c50d1e0debd3d413580c5322243f5f084bbc26cddfb7d3ba9e715a24"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "fdceb25ff9f783ef38431035176ed46a88159d87b67f54bdda1303e6a68ed4ac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "0fc19a2cc5a247426dff98b811b4e333beca1077ed87e3c66a9972de4015f0e3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "33408a1a16ac799821a6ca4d7eaef6bec9f26d9d1d11c09dc774fc71aee3a84e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 
128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "047e9fa62606f4f259927c15d43c09ce87b345c4af356ff73e34da42b503a949"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "b2d34136e1b5e0a5bef7c504529683d474763be99d383ac8460d066ada980c68"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "918d38495d1efd9f39ea6f0ba946ccfeb9a40c20a07af05f2a159dec2664080b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "33cb26a59c61d6f58bd2e5c2e75600031d748d006b06118588bb55df61f4edc1"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "b4abca790d7a075c500203847eb80b76a4fb1b95b2ed86ed2d9e710866b98a17"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "0c915d55ef0d45422bc30be858baf0e75cabfd79c31d468687386f7c56c30996"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "34523a1e69f14bd74eab7796e270547c6c41a4b04149366240ec7b16b235ca24"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, 
"ed39d68bfaee16b4c3825d178f210d511d194e535610e1fd48e9fc76b9502961"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "03359790238d7238e8a893db7acaa5a14532a844b52be6a6857048183bade25f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "7ae41a51c0dce5333d579b778aa7aa95661e55f6822c7f3001b0f5604cff7d47"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "6c8fe2b6a26b81530abc0f1563d1c7993cd9ac0b83915de3bcf312f77d268c91"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "b4a47fb7594d2138ba5a5911d1e4a5c0bbb18bcbdd01172f2e797e229b6b60fb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "22c742bab14950ce3e18730a6f6323853c5f3938e9eb18d4434a4b68b1d88838"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "929f29ecabec534bc6f9b6823bbf462a291f106d35e77aa75c52895eb5e1157c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "626254016393ed000aed13e0444a11b526c3df54628d0bbfc991e0c17b349af3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "a092eee0ff962685105439fb3a21aa94c5a9058dd232748f82a24cb6eb69ccec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "ebb775e153409c82a70d78ce7c6115b9cc4367d1ef2e1951a2f9de8fc68df2d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "8398fd98f8c4bf5f92d04ee34935047ba2dc7bc13ab768ffbb54bd523bef0939"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "21bb569bc933b2b8e071445c55164479a7b22ef180d303642e07322d8d543f3a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "b28af668026cb6b924ed5ed631f1acebb5388bfe845eeffe1141a2e2c9c8824f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "9d6b859a7bd7bc8faf4881c27d8f44a72ff2b54d4b157f0084022f316ac75c56"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "5252d84a67c343a059ac0bca1521cb64bceb986ddc75418fa5d5aee08e22ddaa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "ee22e4777d03d3053d2905547b6e1b532eac56900625703dd25a5cceede6f7ae"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "a0a94635e21db21b98a41e649d31a42b10ee35c07ddb5b3cc32cb11871debc5a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "46befeacc75050b80ee9194f1788b8ba809bc984886f6ad1804f990b90c575a9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "c8c9604074f61333dfbee19d5cac059e9016962a5a5c9374d5b183e0d331c100"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "7be4d84b1eba9e47fac99ab33a707426480eaee6fd4b32e91e67b9b753a136e5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "16d297e2b47968ef771543d489ad44fc820a330c6a61afbaadd0c48c579188ea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "816c06e7d9efdad2828e16106bb1ccfb6d221ade42a5d72eae72a347c43153f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "bdc1d9d2bb1404f3b7142f2c155f7c14590e9443a5d21275724a21ad28254146"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "b681d9872010fcd1a4647a6a7613f3877ecd0c51b621ed4f3babe3aa6a239967"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "9b0007d7ad29ddb1eb1ff949d9e6cab45afd7db09cd4d9573718a14af5a36b7a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "863c1f908f1cb63366a7dc172246ded311de1842a15b1fd692b699325e3754e8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 0, 0, false, false, false, 
false, false, false, "cedb6b51fcbd66566d5c34b94109fd4b9e2fe9f77fd8f186585f901b3914e4f7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "68bd3385cb9c0b5813e88c0d72c8b1d6736feece4f0d4f0fef144a8317b79263"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "75c6104f347caf790402c93d04e1bfd24db9ed9d217dadb38ffb60e241383fb3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "7285b7c5afd413ec3a2cf777d51cd187f3b556e3625f617305d8998cf112c975"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "443e8d0f337bd604000f1cee7374e2db644767afb29d774dc1e4bd937955ccf0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "8225e78db5193566ed519a0b27ce60feda7ba5298506a5dd07910f074ebf8569"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "84082e094ec293426458e51f92eb06848c0a0b1c30fc641678a1f617bbdc5bc0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "4a1e28488a185f641cd64f0653ab10ea8f6ceb34cb8548d634bdb694026ba208"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "64d4b13ee0da2a961154ef4a069ae09696f4b8307ea13d212579f0640960ee6e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "39c867f6594abfbe9adc5421899e455f0770203b8c72e6495c7ee0cecc07ee51"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "d6da2183cd33afc82b6e99f9ad172b0c54772dd6caaefa8aa84ee6cc1128c9fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, 
"c19f1348cab2b35b1bad2edcb30e3685778f1501d841e9e58e13771eeac8d141"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "128365462d4beea7b9d5b5475acb5cd41aad0cb450344570943b3c3db63be63c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "9675d20748d58939d353153db3e74ef9b2c4312cb5892330d5be6ae1a27b4857"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "fe6b5b7d59034716264782d2c4ba30dddd1bb208e2581e3f20b15a8e9f314e4e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "1d236c43e7a59d497a8358c374c0da71afad508027fdbeef17c9fb8f21013086"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "a27c642b5c47279446235fd60b9f4567575919aaf431d67a5041874fae985e18"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "d118a3ce726e449299179bc45710313c48d46a0a50038ac7fb201c2099e8f6e1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "f8ab3972bb5ce3d35a228c1a028256ace49970504f9833e03f753496c0012f7f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "c694f5393b7f056c1108d6aa432832013ee3145c5ad6c6486e63551263afce13"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "4862ea07d42cf5c5c98981fa46d4d7b84993707fbfa6e2d6b8f31cb90f55e91f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "7d191decb1826748a7d312469304a44c2c0720bfa6e8775d35d76f41e50a9bbb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "8a94d50d4f9cfb21f3c53dcc334618c2f19ec4bd9f1c90c438801c453a7582e9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 
64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "dd0ce262f60336f1ba161c2c03de5ad2a504629059c7e645fcf4e2ad9b543595"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "26df6c251fdd82516a1d7360a389fe23dd10abcd184e0f8c03ae5b66489cbb49"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "0fbdb98553c46258bb04b540d45f3022b0d37035e6d05b4818476b4106d25fba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 
41216, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "d2cd2a86081a75a44391517ab2d87c5a8c50db530b5775328f9d9ca55916cade"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "be2bf307eb2014f9f6cf630f33f941bde81ea0f9f6b400badf47100e58147949"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "4bbbf3c497519158505c57a6d08b4ea64664fb7bb6cdcaa149ea0157bd05e8bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "0fff87ab0c5ff28e4f5578c9af779cf17027a99b8c9bdcca81571639852c747b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "9e51ee9834a748e63dbaf68c854c4e80201d46ddb08299463331969c97b58c3b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "1515bc1b02b7b5670fc182180d2cc3748c633d98671ed79cde13c802056ba65d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "d7f63794a5b417a80823917fe27da0fdc076c74b5c80f58a12c5cc07dc17caa9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "f6476811bc22bb966d0b16e8b1db73cd472ba59bd3ecc2598fa13338eef21a4a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "bd8d3634446bf2d6ccecb39903d051b8a945da673b7af615d1b4b824af87d126"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "deae90083ef0611641073623a0db38467ee6a0b79b25335d7f799b506b79cbd9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "f8ad5f2534b4290bc2c7ceab4b76fc227f188229c4364aaa11c8f8310a0ebe21"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "91fe21204a8354db48b7afe927d20529ccd708cb5a4099800ceaf4cca9b01ca9"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "4f815ead7bfb8ee9333094ddca2f8a6d6d6529fbc00a0468a2509ac8aec69393"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "81c5e695626f7b5f2f576912b3dcdcaf88fbd5edf29c6d246a3dfa5a6d73c9f1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "cd3e734459d6822266b2f271fe71059f1a14e90f807c6c0efd7f27dea9b3bc2a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, 
false, false, true, "e335ccdc30e282ec9549edab03ac426a9621e2c72e404dd4fff145a772387825"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "d6fcdd1592cc5569653ff1b4392d02911cfcd35f03135c0c56e5e1296087fbd9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "c7ee3d2951a6c8701239b1a60767ef7c1f2bf7676aeb73c8b85c4960bedd8cf5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "459e95943b5138a44b37d99aec3f07f995ac1a39adb3b08983214bfc6ece9e35"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 
1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "bbf79ec10ab4009231f33bd1c27a0041523102e095f895462163f0efb3d9ce4a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "34e7d3f1e9577e583e1d9e468e2d4ec4f3b52e043d0188bcffe02d424212e49f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "494ff1d7be4577d33a850864e5760652c5609801ee49233aa04c56d11f1440fc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "522fc69c45d52a300a7b56a08a77753ce03d6e96f9c16b3e36e9b4d190e7369d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "b0c0d1ff493fde399490d0eaccabe5a4ce37c42fb8d951147cc6838166ac829d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "86365a93241c8a5979ebb69375c20dba08f6cde1a3b290458d70ea43f3ddea03"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "dc863cf55e1db84f125052741bd36f3bf6a8a8e3456ce2136ef1a26a5e158e20"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "f19630ed69279c6cbc579c8fe9a14e496dfce7a2b7c96348c2bfb0cd3e4f34ed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 
256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "d42728a04e93cc9d9c00d645051ec27cb0f7b54787312e7ce40d1f84832b5708"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "fb40314b34da8a36fa7e5295ba995a739375356ce8b50c6aea54035acf628353"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "f1fb1f8c600d0d2bd6e9d0365aced86f013f7d103d1caa78ea6805d82fd30ca9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "486d643d958e8631356b4a87d2856b107f68e9932765dd8e9271dd99f7bff419"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "400b4ef62f60f74800b560239e2cdb0d389adf3c572c536b0cd11ab8dc720361"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "7d55963b7c995b51e337c91dd0807911b21cb38386f4e8c5e57fa0a7a1c019c1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "ae8f84760b88ed1190bcd6404a6221d5eb6807213fb433fe61a9600973e7f4ea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "f08f9ae1f3b17fcd3e0008cddbc3dcb92a986cda17d7fb80cfba40a8ed9b2db2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "a25c0d35000d4bd679d36c658d25c8d2169b6231a7d9a5412c4784be8802ec74"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "33909729c0567623e9ed10d280619fea22860ab5e77bbbd5b9c22006e098b727"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "faf1412a6b35a6159c1e828bf327f4b6d8b6f42d2609ce3e4d7a1be6a6c448da"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "a301948544b46252d51342114b3407dc93880100af61b94a9e7c94c62ab8944d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "0a157fc12f2a2a66eb86c9cde3b66f279914cc096527e46612f43b85c23ae043"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "28292d436c98a46718d5fe5e2b481915b7236c62105140d0c84509f97fdeada8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "45c03185f0a80cb8ac0f16b869e5ba5352e381a6c0ef4a08587461f6b9dc8a11"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "bc13269f7cfdb39554b6d84ae4b1ceb8a6139afa5aede066c829f72c2f46fc2c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "17009c58a182bb52ad6e328cd25940c3867fc1a03d82c4b975ccdbb34ca0fdf1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "6e3607ac4c1a5ace707450d27c2cbf95fc0af573e0bf61a42cdc95d9ac875157"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, 
"958ae1b2141e8a7c62012382131f1ae51b22d849326c48e12297f9ba5e05cded"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "97ed5c6df8e86b9d08d6c8b0f2a0e599085c76666db022b33d08313d9a19d7e8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "5ef34faf5683e8efdf56818b63512438d578b0aa083a5e225ac9d28bb6527457"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "a1fc371d93f864df7f25c659fdb703116721dabbf1a830ae096bbaa8bc58ea27"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "481a25ca2d9ba09c259d679b0cd04af9b7a4f74aa5ee08c096208605734bd66f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "ee2c03d0a5ecb119082db160b9796c75c0b3ff59c001237f8ae4984c2b7dbea3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "9a080cde8a0f584a18378355740630670135a5e8bf31b8d57a22cd8671d74a2a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "b582549629c61a704b218c84e831bc6f8e91c5a964f6d7b5f73ec081b189d765"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "f41a5cfd13489e26903d4a1925080ca909c0d906e0f4be00f1411a43b702456a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "397e502b1c46b750be709d4fda2b5b4f162bc68570b425821a29d8b11224b8ad"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "270a7c0ab30291b0830445ab9dc6e5454f3fc355615592a7431ac51b634aa053"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "76e2c1df1e4f458089e8c222cc7c4aa7f68c69d098af4750de99c23dec88ca70"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "4497c629779012c6ddf23e22cf276f7820c9ea1bb4a793fdb314cd9d29ea4381"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "f2a9becaf63cfcba630debbfd93f34390ae666808665e7d581c25fb0e3afa1da"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "4328e3157b5170ec299ddd35a8f477439cc2d293ed72152550054d8b1caa1b6b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "1e06a12df828fb19038b9a9da5b3a33eb720a5448fac30f71915463ae6b298ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "e58b270294051cb0f16d46a3cfa22abefd69580821430c7ad73572ca07a959ff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "3d3ca2baaca6856f9d95394a578c964cb3f2010028e4ade4e5d23b51074406bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "b44a32ca58c4cdf91156b4af998250f20c8bae2a00bed5df231c8d2731fbea73"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "d531baa8d191b506455dd60a8c87dc97045b2fc329ad887dc0039288cba2440e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "ea0a4ef82b32a856ed965876bf0822a8adfcdd697ce5007a51c93a4dd59a3e0c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "19f49281bdf52f40e9ced0272992e361312dfac757c7eef657cbf27dd5a129f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "24675a5699dd926ffd0c6e91d209f825a81264c35c680c6ba857da2d1e1313c6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "d54450d91e5b14f041af3ac02a94ed91a7beb340064120e6b9178db1c45494a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "b13016b3fffc164ce7e863ab8a5f41cc550043425deba0af2b1b1edb7ea38fa5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "8df04fee5d3b80b83595b02f3d71e23d13c44045b5fa67b44f99f20fbc8e01e6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, 
true, "8c2c5fc423cfb05a3d11a4b8cacf2d8c2ffa37bfa2260b1eb9f2fe476b584b19"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "28d9d3b0a306f7656f3c9402a148c72eea9c0b89fbb9a4172ebcbacf8974bd7d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "654c1df56eb7f2385a149a03bb02c514f9cac8093f3db35be3414d30972878fb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "8b66a033206615d1a9d6afea8aafaad89e33bd4e96af2b94adac576c6febffa2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "d7a8a03af5a6a3d8d8f709fa5557ac59d3736e69b92ef5a6fe59446c811b55a3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "cf27805a96d2fc6eecf9be39038695f1d5da7aa267bd9ffee6b9dae6ff8d094e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "149f4bcc299054d8990aeacc906554fa7fa7a9aefb1636dea24ff6756953e8d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "6bd43ffa6b4073afa6c2149dda78d1b739763831f9e655b26da679473f2a7eca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "4d8b4135aecf81d09cff36dad6524b4c46b692883ae3dcdda4d13f9160a3cfc4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "5b0a4667c5097650eb2bd41e884827b2be2cd2f473d8254c15340fa075936b73"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "e0b3155e31a2fc873760ca9168fba5fb9b20f2f24f438187ec1a42d2981e73e0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "6d533e5028a9499f89b2dbee38d405e8d7aad3916f28b74fdca0287da1a03002"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 
256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "3bb3aa175b2ac2895604b847f249a172f5fe1a007f9b15c1ebaee7931dacdf1e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "8f1e458638c3dbd2207288327dd94cbd8e8e88bdc3afd94feb3fa2ae798bc9b2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "864720ab51183498fafa0560ddaadd2cb335aa50202a4e9ed589fadd561adb78"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, 
false, false, "128404470c683062486b7a2a201e70454f2b41945ab95032f75a4f80cd9c1676"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "9472ba8a6db5e9c963de3e6a195a2969899dfebd3ee61c9369b33e10bceb1c52"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "c30e74331f0dc7bb4ff9cdf3231b34eb019748002a87aeb508eebc77279918e8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "236b1641c23c66b9cb4ac1523368a0ab413e521f1bf82fc6c79d2edc751a5176"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 
1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "e49b94f95b9ab8039fadd111fb06131c8e3bd922b558d56f7e63e8fa8aa64a36"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "cc307cc95a1a790a0f35b9e35946391db21fc18d96b161c57681b40b583d5424"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "1a4b909cc54dffe1242d8e92808c6acfc5ffcf3722eeb2ea5262aad57b23827b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "1e7a4a1a25e02fc291c1fad0144f45128b2488f6eaf48ba3a07d338678926857"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "f92456a66f05079b850c37acfc474e80af7a44d4d2d2360a597d7f1da233ed9d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "6d3a2ccad7370fa92749f000cc8e5f90fdb336851e118bcaf95b98a488e97a42"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "aa938e580288875fc6e02d862b84ee8c146b286074a80c364dcb8b54fd89ad88"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, 
"c490247d00bd5db479ab4d2e860f3bb2e97da5872286a38bb488d34baeeb67a3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "a9b59124b207b3ee40f997c505dba866e2e79598cb546252f8794c16a63ca295"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "ac11b5d9b8196500277c0da2a2e2e7ade9b59465e878abc0f13b7985b2183aa3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "ecb95dd27a52e123e20b8ab844b8d7d1eafcab595d77f15ffb39a58918ec0085"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165152, 
512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "14866d21408b7d67ac536bbd58c07b4fb0c8b0e78af47a05848972c35e25c756"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "c8256401d9cd2bab38280474fb3cf580db3590d3f3b4fa923e68702d8c72cdee"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "3aa162d818d8e2d0d5b625a83316fb057c8e946cb61253f918a02514280a0d57"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "f725b6d56eb8034cb713f564ba7f6d67fdc56284de8f31c4b21377aa6cd603d4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "6ef1fd9a8bf61b7f0694d6f9c73e3815dd5c2e054e8ffac973c4c91420e21604"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "73df4452aaa60a3fa4716aeed328aaf4830c43de4feefb781c2475448dcf5a43"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "10b297a9b4ba84b93d16c9fc3186d6fdaae750446f37651860cd4beff9b19ec8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "867126e107883274a676c62b31a894beabbda0c81d20c4b8abb437c50f6f992b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 
128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "128ecf1bd7af0a8e0c18369e25bde9493335747aa8c95b85072ed3be60d75f92"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "fe21a234395c4a6498545523770a5628719bd362b76c167e0924bf1f7804ac51"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "67f92eec08df048f4b98670bf94da7e1a88a12b4264edcc9592e735e2884d68a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "7f6bea9f1bd25724207975d7b019e956ad8c851aef7a6053a2b7610550a53b24"}, 
+{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "3a654b2e92c1d602632538b31ff954bec1fc6c6a5e950e3b13dd5f81bc804678"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "72185ebb20fd75ed2462f77b846030aece8ca4e500d112a00214ff5b1d8c3584"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "00e3c138073d3e96b95b3527ca74a0928628da16f8e4bf6598eaf87c45382914"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, 
"3c97321d7483c57110ee458845e028246c39d7bf3e9456890fa89b74aa0cad52"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "5a612d27125679b2efa6f04df99f28ef8f9c75c694e5ddcb27045f2480214a03"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "d71e06059410ae258e58ef1271e42ce2e3bb7bf412af446d349214561790552f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "c9664c4c75b794567cb51a8dd09d98386c7d0831703ae1d1313ff737bdecbec0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "44a616f36179c9ce73ca25714d1b5827e4c47a6438e57ac3621e54fee64451c1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "16c2b200e43e5f0ac55ce7b4fd949c862ec832ffab019b478544b19992301656"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "e8712a3b2180b4a546110df615e83607717efc1e5cd8416540f23b1f54e8bce2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "8dee9557b8ecddc3ec653cf11c60aa0f0898029e73fe539c9085ae4e81e6c0d8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "0032a8d654f8de832a78a1fcc2968396f8aff8695cbfec95dd2e836f4742b667"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "10fa225b4d6b289bbfd1e5b51ef43f2f63c73b122600c815476a27695e644afd"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "7b64cea5238646da8e0b7f9a5e5e06af5ecbaa20b2605587876658d05d858fbd"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "b2d76d9f1b1878165c84a54b78e17782429f6fffac066a6a1c94bc3cf08b468b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 
DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "02ecd778d4d14eac82876ed8db30aa35bffd42ecc07c30b5aca9ce0180353ade"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "2a91a4c9e0a96b16fafac4d3257acbe2ef9fd90472492a7a6f2ca2f4f93f38aa"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "86ac6bb0c3e98d64788032bf96f3b99ad6d02340b10174a146eb1d14506976f0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "c422b9072c53602f8e95d4786fe451f4294bdf800023b8520bad2d1af196a65c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "c8c656c8d43f3deae5461e6f37bee29e00949112251d55aa7725a0f3dfd23804"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "6a90bc7928d10a7fa2dc8b3db63cfb150504a352c1aeb4c6c64c205b01610e77"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "b390fc51cb7a41ab2ac92424aeb863f05dacc28282c0f3ebd3e764706f8fe0c9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "5e778668dc4b33c8ac8352568d5756050312854e63f2923c458d9a4baa3da854"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "4807a2135e41e6854cf0beb92c17be031f1c75af96a54b4c4310571eb8e4c388"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "a92a3181cf44d3dbf862364dde44ab134871410e88c0b39569f60d0dd8b9fdb3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "a259259ee38b1c118bc60c5cdb5b467358671035db29ec32538be86cc640c50a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "133562403de2e22598d31fb916503c97b8eecf60b98dbf4d92c59f79e439f017"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "6cb20e2dce17b0a3b84e4882f8fbf80ad1d787b642290b6a97c50dac0392bc38"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "2e5207dd033912c250fa90af395acffc4c43e3a884683e1ebf1c02507f23856f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, 
"ca73365fb627059048fbd1b7273394fd13824347766b2c1cee8caae09c8802a4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "0ddc08d5326ad26835b7096fd87223e9be45ad1db9691c547266383c7a1b27f2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "3695a4decd5c7f73ab75af636558e440b62ec37f2c9309e258edf566e7c28ccf"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "a1df4416b282e02c01e957c9a9034f0526b0f497f3adbbf5a70238232cbb758d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "ffa52c8cb59613cea7cc4177c1f395bc63fb1078043027d20debae0cd71cf682"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "fde3d4180933cd3379790f59ea7c3020a100458fb3eae3d278b6ed2489f57a10"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "734694312933d646c8c860c464647883e5a83ba0d6b0fcf23b459522e1f382c8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "ac1beb38027787b1ed079e68bcfd09127058164cd57e2181ebdf44914fe99808"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "ccd876a7530de494a2afb3eb60589957bc726995ba4b6cb10aa70f847aad1550"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "032de8d9d1b6ffb725464b189423807137b22f3b7357ae11a972f0c6de93b3f9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "15af9a2e6af42b7364d547b294cba5cabf1b2e8d6ffe2952ec8988bd1fe1a457"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "6441087cea790d77fade20da4883595cf936a5a75e454e0bca22ab95276257e1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "8f41bd6669c263e5f985eabd24322aac4047644f114dd432b596329e7d2a5a4b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "db1b1110da1e2931ac06aedf8a1b557cf5ce5693f06c273c86ccb0d7053cf4e7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "3b2c267dee91d921cae0af74cb31f7cb4bcf6694f977416cbc678d6eb9608688"}, +#endif // EXCLUDE_SM_103 +#ifndef EXCLUDE_SM_100F +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "63cca87ae68bf74e4997753db08889ecefe3ee863133daa15f2d1b50dcc15ef7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "5b9b46128e67de512553d5b66381af307be0ee24199652115c636d18bf0a5765"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "46689c1f052b7109c0188e24c74e1e07a6b44c6d06499f426486dafdfbd8b5d9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "3b4f419bb606016b3c522585e8eb77b40edf165845789ebfd4fa9c5a8a9e95a4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "ba51233eebf0d58a59f456d055f51a9cd3bc216b1b0a8da16bf16662a3b36c99"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "25b5f51f9f78420382f255e3bc76ac07e2c931df6c3454e0ea838873e33447ec"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "c261d3b671a98a1734a396e53b8c51d4301ee31e15d49683fe65270894529797"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, 
"bedf5716b6bd808a31490b3480adbbd2e602d4a7a4b4287a90c7f87718ca27e9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "aa8e001b6601cecad5fb8d594717c900bee618490abe08f4d6b59207479ab124"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "1dabe2e3943a189fb10249ddfc93c5e50c935827f9173f584a623c851066d281"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "55579805790f38e42c50e2e286cb471c2c54d7f82322ae7bb0d4b95b08deebef"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "4a4b628304430e95906f6952509d245211e141b358d3395b76074c37b590bd3c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "9ce6739063a2bab3ec4165cd5a6d62e55ac5fb83594395251e68e465e2f2b215"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "e2f78e6ba979d1db25c316ff787a5891149e34ce5468deea67aa9a914c136a44"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "b2d65b00dbdad693756f4a163fe84a899fe681edf9bd30dcbc3056ac68598504"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 148592, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "5ee53ad5e167a4a4c0b9ca75fef351cd06959aff06fcb09c4ee2ed9f75d1d7af"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "bcc9637153cc97f22128193705b4e284741d77f18bfd0bb3086bfaa861211eff"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164992, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "978d1b5c94154ba01fb4b1ac21239c448cf3b463f9b7d417456de1a9534d7658"}, +{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 180408, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "afcd5be6fa466320a4fac811bfc6f0d261ccc8022475ab923b04cee7035b2f96"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 193976, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "fcb56249eb7c758047470d9f1e318de3b5917562dd805b67e913837b840fe3d5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 148608, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "b9fa4b29db8bf53c9db5662c70ddbc6e3e4bbc138796fc22af5e608451f71242"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 173624, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "93c48da3edf707b097f710b269b615f06c7843b2945402d9b669271e98860a9e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "a826df86c9a6a2f3b2e0be40f460bd29178502fcf0cfa1e1c3ebe9e3d796b12f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "f7c69e17000dac5571b2929508f4b72ca0cf09487517d772efb18fe47df0f33a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "f969a15b67dc346c0107e581001f6c0450ad041db9ec3c1c92a266d555169044"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 148576, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "5b0b3cc0e07f122b817dc5ea97f96fd2de1ab2728a8aab0607ef87670394296d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "1f67d922c92a800b8364beefcb45e5719980180231f2f83fce56da2914fbe1aa"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "c5072db17ac0879a78a81c61d949b7fe1fe40939b2c4318205a557cd2ecbef1d"}, +{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "4c1710380e72c589a6ac6401e18e10a34bf30851bf0d8bbff1ef4cf93a554a10"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 160176, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "21aacf0f40faf8860b75132848bec1b6716a7086d3130d6ee0f3a65f45c39e47"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 148592, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "648488daa4ee0dabfe59969831c4c55f319984827bde31ed09a7296d041030d0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "aa994662e0a384e4aafe0de191cf738c8d410b9a9cf06946f2c289c9f7409ee4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "12d05c3bb851f5a0e41131c3cde0dba6d86d38e582c4efd5cd9ab0df49f9ef14"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197904, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "7bdefea807cdabb69c5ab5be675befbac0f094a30de6ae9aee0d59387696e9c4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "18f880d851c3bcd31b845a12824deb1568962a8bf14fc1c93770b5bd8ce83464"}, +{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "a2706962c5eff522614604eabd331801d6934bb6dab79924c2ddaa19dc48f23f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153872, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "61ae5aa014ec8e804012b6567eea616a3ea20bcb77b9aaddf9d496102815bdbe"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "420eded92128806d67be8ab2051dcc2d1c9f47b2dbc52efbf9676580f43b091c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175376, 512, 
2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "5eeefb77e86b7061df4ca4ea0ba29e2fdd33e0c2d3c362f061f53b9e967813ab"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "3cb360d50b84913e031dbda810b4d6d3ea475f23340591920b7ee1e5e5b727af"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165136, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "ff37a355ba19b1d5f8535a13d8575de62c1013dcbe148d32dc2187a3544fb0a4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 148560, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "4a56f86b498d64a4295695ce41b79d032f84e822640a7416efb4234f192f9b4a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143120, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "f66770445aa61d498f96f90471b81bcdd280e4802ace92ae11cf2dd30415f8f8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "1348d7f951c3db09c217b9eba3280250315e5f851cb9a358784541128080a09d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "d0e250c8dac1d87af0706bffe3b411ff6fbc131e5c5b7306791d905fd6bd2ab2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 197920, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, 
"9a34bbbcce80d54dcb9642c0b12b88917b31e72b961ebdfaeaac487f715f96a5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "61c826d1af6b3e37a7ee5d70d7ab0be7b05e17e25d922ff2627eac0c59692815"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "88a7d2c86a7c5454c121443789e0d93b22237340eb4c9f45fcd79540cad3218d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 150880, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "583a63ff2f1261deacfb253cb6ce47cef0c0c67389ca35c7a0dbef351fb739b7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "2acfae134a4925b866ba560557ee3ce743a81d5ec1205f8c4506ea8d6aad72f2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 168544, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "a4a46695dcfeef9e34f6840aab6ff04bb7fa2af39310973dfccf8408aaba5f1e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 160176, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "68271175a36ede59938f0e18134fc4d480a635075643f9bd7c4b753f68823102"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 165152, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, 
false, true, "aac83d9c90f4c8f289a8b3c5ae6afa4bd83e50629b671d338958ae74f11103ca"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 148576, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "c5a110875d5db5ae62168577084c440430ecc653fd3ca2d6d234de653d02b0f1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 142048, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "359f2e006b02b8ba9b9bd36b558cff6160046cbe18623d5f58dda71955f05a71"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "a6ff82d0b1695f30ac334d73d26291ca3554efe04831288b384eae1e5bff736c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "438fe67ec6c4ac0000bf273732ffd6ffa70db5949d2d25944a2ee3b15f100b4e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164992, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "cfcc51896ab1f0d155ac27358d3d6629787c66647808b6f7cc04b995f6d09354"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "37d0d87caf4bdcab09d398d3582262b79809ece453bdc2137bd7bf7f99611b7e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "119b4e6fdc9aaaf8b49dfecda6a217ab966964558ba0194568be3507c309f5d1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197904, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "d15399c2b66d653451c5846351f705c98409fc001ef3f42e2814038a03522815"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "26cf91f3560a8f5e3c6fb55a42f2cd90ca45e554e44f5b23041bdcf1e661e288"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 197920, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "8b984d220848a8f8bef805d48bad523caa12b065922621543eacafeed97d712b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 
128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "0dca5928912b1b42dbf04b99842aaa5e9913907aac2e19634d27a87b239892ae"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "6c3b8ad4fd30ac8e1ba9f3a3a4af03d0e8554f4067fcd182713acefd2ca198d2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "290609b762fca5977434af23ac737097ff86782d53e49274a822186c53ea0782"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 0, 0, 1, 0, false, 
false, false, false, false, true, "698b6b9541f786f1626b2fd251a3d0a1969ce2c664f7f2f3231739f1863bc4d2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "5d49a5f05b8e26ba1b09696f11d39a5f7c7bda212358b5991b24090301bb9e3f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "63ed2620fb57f30244b1fd3767aa512f5303f2ca6061799abffe2e69afd02871"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "bcbaff86f50e22d485cee97e0e9ac736660664f5ad36913170181c2d649af6dc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "2013c43f4fa6ed555a2a4dcc63de3b1821cf8547c6d60fbc0889b082a68c6c8b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 148592, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "0c223c97fce3474839a9f5e6e008ace27a1be250a732a2dac6862075dc8b7567"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "626a460dbcc3b30560ab929b4e71ffa4352bc079ae8f6ce470cbfdb3cab67e1c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164992, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "8530d09a6daab3646bcffe9b9cfe85397796d4a10d8e3e6e351fd3a1972294b0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 180408, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "9f548d7e8944c921353b909e1b0bdd9f25dc037f7634e856c13ab21ca4767a8b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 193976, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "55d6fcd27f4032aece7e28d2ead226b75930303f98f023ce1bbd8ea3dd063071"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 148608, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "33eb2736ad602bea906e1716801c3b88b81ca6fbdaa3deb90cabfa0fe299cbd1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 173624, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "3a1516666005f67714247aff6633aaa09a6e969a65a4565cdf332790311aa334"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "1d93cd247df0882a6836b49d2ee912cc5d8cc93aa8c6aa83ae86a3d6cdd4acd3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "c1985be9cd693d5899d2f763e154067d01c7e5e5f99547cc4a7c26a65f469edc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "85f71c7753bb8b747f760ad256176bab02e20bff18afc4463c4a6fbc1e478907"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 148576, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "a5d7c0876fcb5cc1b9825c0ce8e38638e0ea7500afe743b4f79c6c8c77f2f511"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "73444c03bd8bfab2fb87b245245dc0ba9b4e64182a3433f74035bfed4a575ade"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "16d714c5dae81bb7b0b14648f5444455d759d9b256496d09591f7832c67b6005"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "f88bcf556f8018008012ad2c5e9c3058ad6ac66b639550c15866fa259da8885e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 160176, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "412d23305f4a1cb6c80cb1a33e850cb5bea06a81e83d58cbd6e0d92c07aff375"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 148592, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "b150f7cc5beccde35ea91c4a124e98fcb052c6ca4cd5579a8348751756b2aa74"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "c606a8cd1732d34a3409ac825e08efca040e335e368f84832082c47ca9d127cb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 
165152, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "052efff924fef599d3fa4e39e48a118cc05478ea0e46f2e8b650431cb222b29b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197904, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "610f62105373875cc312153be90c6cf5e1e8344531ad8c026694f5d737c35673"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "6e13404fecb7788da7802afd671d44f7bf5d37f142d088184b57287ad55d0f3c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "1467a5cfcd8dfdf75fab311558c619eda4e21631c1077bac700840cc01caa27b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153872, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "3b8c5c2cfb6c5262f79fa07838d209c0edbd58e1248dccdbc31ddce05dbca697"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "d828d927062fa72bdb9baa6c1856bc025d8deeb1fab598f0c852ca6a17ae92e7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175376, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "175c36593de01f44c415e145c05be8934aa7d9602e19a276af43455f94bca73f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "412a511c3c4e10071064524e3c59fed9e49d582ca96f155ef23dde665ae9fa91"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165136, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "8d58affa37a60b6ba263b348b6ecbe72c0e0c1b016bdd9383f00b879d9f72278"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 148560, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "d671001d65f9118c4c783e6437aecf5aab89fa88c00d654dba1d3fa283f6230d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143120, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, 
"86c8f30890596714fe5f3876892eb893ae84fa4fec55816044bbc99d60e1a2e5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "c139b5413ccde9c0ba0fd2f8519e1d7fa62af78296ab36ff8eda09c7b77d3333"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "b97fc6a380d6ebd1b2b8f1e328b0fdabcfdda8bf0bcea6fc307e82a5413fde49"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 197920, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "bd23555b70a075a11f35a3eb8f1f612344b5cb2fb49d5d4f2b6ee940fdc53843"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "87a391d07833a286b24c694a21d546bab211e6b0f67655d538deb422695f5877"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "f7f04bff0a441d61f5c5a73db2328b9e6722c140a573d5a5526cc285eff50946"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 150880, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "4c0e0f35b778b8b82647d2750ad3bb65e95bc1725b4006914c464f02f34fa859"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "8390ee648b25f19ddd5efe01062a38b0a53d944c35c2d3ab8c1056387d500496"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 168544, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "c5c2fa4042d80552be8724bd6ffa8406799fbce3d21f1801bd3b49781846f110"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 160176, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "8a1f81e4c5a3e3e3fa548c8272d73c7a28ebb650228e6836f15f36bea8c46fe3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 165152, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "61b371cd3904446578d97e1dd1127076c5d33168aef3bb36f5d6b05f79b02984"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 148576, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "7bcf0a65ccbaee40a5325e818eac514499d1318487dd9d5c6e06e7e37876cc5b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 142048, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "8741407f5f54c167c210558fae1e823ae699b87b84d22a707b3d79371f579f9f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 2, 2, 0, 0, 
true, true, false, false, false, true, "4ac7db1fc91e05eabed88e11f57c047c7fb74819679d4d574cf831c86b0d3c69"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "4c9d360741449dc322aa9d3bc458c739c549270e9d9de55c89405b069acefdb2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "de9a30f48ed5d64960fbda1ddb557945b71452f7abb8e8215a390484d7469ab0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "0489420be90e063f53b5a8ec572180ddfbab862a60550cdf7eedd90a7c6a695a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "c0d5eba4c1626cd69a71485403c3698d39808e0db7059d7c8c87b777c6510d9a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "77e2a68a34c28d578bfcc205a3bed691df0a282e290356a6d2e46dd2819fd5bd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "646b3b516ab6332bfc49c332448720f8b80418a45aeb61da0352b5c172ce4321"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "dcb8834204be7a828e7ed1cccfe8956b05736109d91e8a983bdb31a878199157"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 
128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "68f8b642efb7d107f14bfa425a5a96cd13a99ed31d6ac0b5be4cfab68101e379"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "acbd01f214e4e1ea726bdcd522ea6704df3d75d974396482c12ad1158056abd3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "6ca670c5ebd503050a6018d30cb693f6c5b7af01d0761704dbbb16b22bd97123"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "832c64b177ab20308730447a290889ed20be0ddbb6fa5ccc395ce1e921f455e0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "d819f345b9cca9499c54c746f6da2025a088a430a592a0f9c60a5eb6a67eb8b2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 197728, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "7bdff91916f7772ef394df1c55bec232135b820e10a338951e1ebefffb97352c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, 
"11310d725d5e8f72b83812b4e3c6897da8cb7491c77562bee3c5e417b92f06fb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 216680, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "762c6eb581782544f413505158f4318e3deca07673d2c45489016a354a80656e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 164960, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "9ea31602e20559a579a45bbdc97a257e9ab261ef3d1833592e7161e32a043dde"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "60dff854a97d4008ff2b96e46d0c91ada42e021656f1674b5143d31956e9c9e7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 197744, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "cd470650868a38e52b8421da252b5b68dc54bd5c7d49a67099728b0056d4261e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 183992, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "35be2e3044e9b6dd7f077e2163faa866cdba5a404f85b613dcf388bc58051966"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 201656, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "ef132b67dfd7d2564b97a5d83a717fd790e9ffca917351e12f7ab9afebc1fb71"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 164976, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "113cbb2caef7eb0594feee923b5de564c4cf7bacca31d125dd9284deedc2c8cc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 175160, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "687122a14187c01868dcaecffd176eeb82249e785662a1307c4d25265f667683"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 197712, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "203ae509bfc32215bd61faab697140fb74cc737ccdbcd523632e788a44075e61"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "6ec9aa7ef999c0b59b379619e7d716ecf2f1ab82ac5006dfd0b359a695030b95"}, +{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 183392, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "defa75ffd21b1ff7167c9d203860b8d7ec5b2be8dd536c3243ed143dd52af202"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 164944, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "f25ab74a338566d2b55c2240efb22ce527ee37d9cf6510cbe4187aede1695e4b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "ebdff7c99b1bf153ecfd9c90d6a02efa8c089518c17ad244f34e2ad480798456"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 197728, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "3f62a1edbcc18a031c0a8afb831ecf7afa0abc56977192e4d9e8fbfb03bcda05"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 150704, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "769174ce36c83f58cfcd292a19d39f307ae96d7062885a74f60a3aed8048d45a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 168368, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "4ed78a3003d4cecd24d59a8e3e15ea6214821911ed88026bb4c5fcd87382fcd5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 164960, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "da51bd8781d129e99c0553c020cdbb7b9dc5f53345325f34aa3a772a429430d5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 141872, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "5c7f6f5208227acc7bd9a78774fa0d42f757787546fb2b69ccdaf3ae9a555a05"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "bd80b46b5b6ed3d20c6349fea7a938f26640b1bb843461cba95dc40c99148516"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197872, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "214afcbf7a6b9f30ab40c5276060d06b613e4422067f5b9414a59ececad1833b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 
256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "ae625f9fbc7d9ef05e3f4853ad37101610f79f34c249df176856940feddbeb3d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 197696, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "a3416f2cf2c6bdba12a7984e2cb29e0f645a33eb29d83bbd3a5da2e0927351e6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "50316ac72e51bc7d396218e3859b0de1f76440e9bd6cb71e6a23c8fd79525431"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, 
"61c8a57645000a2384d0011fa074a3f96210a2abb6a5012c6704d7c4c5ec8348"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 191760, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "965f8490b54ee2e74f62dd9603c7ce28b43d8255529d823062e951b5dcf747d2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 183392, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "b7f41494da6f8e23628879fb354f0de65d7d1cc6fbbb36f755e1f9c92330e6e7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165104, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "f949d0b5dd4c8e683ff3bbd84fc42b72437d7018db980d574dfa914a29d4d62f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 164928, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "6c41cd083389969d3f71bbaab3b2aa26dfde81186e5f538476ecbc76439392d8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147216, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "b1c5cdb997a6b96c1b210326600bc7af50b8fcb3150356fc60320bc493837bd2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "fbbdc8d5648db563e3ffd452b8c3ec905dcbea78b374c78c049c8b45a9d4c4da"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "8e7754594e55c4c05f3c49e831218701ae992b4fc9d1063d5dd352cbe7dbb810"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 197888, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "91098f8a3dc60b989eec9b473b608f2c8c93f6235f3782ad727080ac78844ec5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "d389fc998eef5f17aaafb9831399e313b59cad7c6d601229b259c796b445373f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 197712, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "243caff0717bfc9f5eac34b8819da2de3edc44e7cec8c11eab455b667e664393"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 154976, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "ef68028a82644c11d3b5285a4e818e2970eeb846b3729b57f5da6bc4ed73a43b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 150704, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "3ca9da204e7769e3f42ceaa114584916737e396f2ec4701da99692ef585c6dc1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 176736, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "6fbfe57ba7316857d1018014ff23bacc241059d442434dde7c8888a986c0109b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 168368, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "792a7bc1374cf496a9b255cdbb784a103e4ea18ed4ed6f9b428a9cda62399f84"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 165120, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "545456f4f0a2d8ecec44234ffe9a9f5606e112a6dc8a6048c9e5894abc072a2d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 164944, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "b513f73c2e4b50b9ee4f6dff42808892db75325827fcbeab85dec3c18f5c111d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 144096, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "48dc4ce4771a882187866bc93f4abd6d1cbb1594866828d707fbc95e6c0ba5ea"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 141872, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "aa1c227266f09545def458ebb4780ef2578f254a5926442ed157d4c67cd932a1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "7316d50fd10eb25989d692b6c22aacf98b079964931cd40c61d044072d3fb16e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "c8504292db5cda7bf55b1504dc9767bf7d154a9494d26053a568524a79ddfbb2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "3a2ae7b9af04e854a00004c9f6d6ad17a2ac79b7c57cdd4169ab3ae34ea2b693"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "102e4d446d9dc128c75855205d6b239c3d7872718ce3fd8df7a951e4ab2f751d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 197728, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "6e238e3c9677c3de0a56d0726981278f05dd725dd2e4a725857f805045ea4594"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "9573b2a20579868520fe620f86e7d45857cd470358891ac636e2dc89fc50bb9d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 216680, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "fcff88312eeef44ff9bbfcdd0eafcfc93a22a1b90dfa155f0268b8f86ceb8435"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 164960, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "cf8032896598403dec27374314ae38f1f17bb96d4bc11888af0266a4c3239d96"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "0f5df874f9eed13a7bdeb684fd961b412ea73598ef3a1deea4768b37f857c118"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 197744, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "fcd1e40a18ea0c0241104ef2973f29a2aab7110f8c082c8e66774cd21d7e201c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 183992, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "02d9dc3196d16a59cade0bc6e605b969c0ee153d6a54d0d57db4e4bb650363c5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 201656, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "02108087cdd1de775278f8408207843335ae177f1e857f799df2a01c18c871c5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 164976, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "e7387ef6bb09142838cbd6f70c9027e1417dccb55516f7a00565609f9c9db525"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 175160, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "1e94384c41b98bfcd28558aa6287184f31cbafbe9765cf3cdfec92f73a68345b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 197712, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "f5e422f846d44d42d6823723e701a4b3b403b2ec6e22e741160d471141da5911"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "58dd39337721dfd3a2f2c061052cfc8a0e4ef3cc47b93bad2c7557f552880d42"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 183392, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "4118f76bbcb2a040f75c40ca0310aecbfc4b9f74685d165c40b17ad6af86e131"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 164944, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "dc65ec8e670144eab0e73c073f48a9bc2b8278992552401f9ce31f0ed79d542f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "8ccf80326c846f3f845b639ab0969632919d5d13f8c1b0b3d22a66a377e1605e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 197728, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "e0f102d7f8d89fbbc6f3945f5f819cbf13d7159eab6456d2b0c5319792090136"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 150704, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "0bcbb25626caf755a89bd2dc9a9901c78b78eb477e8d5c57004d1b216f336b52"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 168368, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "471e5678bcf2ef47a56e16b4eae5233c1c87ff15b20d60160885a080483590a8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 164960, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "6f5fe9413944013ec98e59b848d03e1c78ec864264846e003ed5f39ceb872407"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 141872, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "31f172e76e6b1046ca06cc31b2592b468d432795cd9d9b0b60629d286219db26"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 
197872, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "92addec1c3476a44a4c3888e33db682364dad42cea06dc9b00c3ac7c6ebeb693"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197872, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "4b05c5440571cb118a773607e7037953d78a913a7db2291cc732f031a5ed599f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "5797334ccdfc3e87257a4bc192057d5458e563494f5f677737cc7b463c9d9ae1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 197696, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "b908add14809e2e770984884856a1f7234d8479962d3fa49adfa3140c94171af"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "37e92e70ceebeef4eb447c580c49b99949a498b14cb2806d112b2c67961fa6a8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "587075d25281adb8d4ec131bc6655a703bed31f5d6d2ace8e8be09f71ca70859"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 191760, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "a06066e01aeb2c207b6cb9923464a33f6d54f61454dcc99cbb13da87e1d6c015"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 183392, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "11636c1122dfcea308e3d6a6006eaba4a0ba85915091d5b1fa2f81925b6585f0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165104, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "61ca0635a5edb9f975dca5828a6c48a751cfa846d9e2e9039d253b4d9ec6cc23"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 164928, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "ba8cf2e40ad374fb518de79132d8caf7a05eb5eaafa2853353035c640cc3db78"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147216, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, 
"8e2f5c64f03cacd3e60bea5546ab58071ceb55a9323b5155881eeaf3a24679fa"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "07cbb644121567f9af57048c5eb08f9f0b8f0cf8659021a459e26a74af88f4a5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "5e7c72fcb6033126591a3ec87dfd16bd78ec30dbe9789f4061398721c67af637"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 197888, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "af380a5166ec6fbb276bba610169343b1d2094c5bc81de0e657b614a19fa3b27"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "24e665688eb3cbc44fe4a0394a606fed8c62cd09b21033464b4babef955a2e5a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 197712, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "560ea5eac73210801cee0d7df65eb03ef87ff49c2405f4b991b6f6b071a28986"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 154976, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "bb545ac6e11c1f23ae436aa175f48735fac1e210251f73cf399aea587b63be1a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 150704, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "13195ba1c66f5738c3a7fc476a800011693742f43e0d960f990fb199a1dddc0d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 176736, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "078bf9476c281a6200b8e974095988bc010b4241f982422505a442dde3f4fe4f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 168368, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "780069c8c552b17647059ab696cd2c0b4a4c6bea9906d1b9daf8fcb3c6d3bc9f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 165120, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "c8736aabe5fc37878757761f32cae908f180a9927ef5a7baf7988bd62f808539"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 164944, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "987bdff6bdfa408084aac464d7445bf9b04ccc28a11ab2e81006f8692d553ab3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 144096, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "b7e2997814bc88c3f202b201ad19e8852440688091d1848d4ba4e04e286cb145"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 141872, 384, 2, 32, 2, 2, 0, 0, 
true, true, false, false, false, true, "20f47e34cf68dc4cfc2abcdcc507d2efc0d41d5460a721fe8a03b2336c5c672d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "2b8396f5f785c2c05540a1637e9296a888e5806e3ad1db4ea1239ad8232480d3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "d9a69ffd6d4a8c9c68a28b1fe619057ee8a2362f32565df947230bdaf4783eaf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "0deacf92867ba86f53f8e0b925da11081e8d26d0beaeb3de8a0e2610585b27e8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "548d6ece2fa576722e748f213d07171523808c530957826c64a03a317db0f9a5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "338cea5812d296f5b5a3d273c0a5eb7fcbc295a7672225c12adff2ef386c45cd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "1276ee9bb88e378e1c5bfdcaebce3eeae99cf398331b108a77f95b533d57c74b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "7b0283d481cbf5cf3c79a5bba920ec04d60c9528b49091dcb6639d8189b9c1c3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "edd2b3412d0e63b27aca9db5b660fe9ebef13130c05dd22b518aacbfefa41bd8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "4b05992683beca003edb433351f90f98e2b5224bfc5f4b48e4f317dd4bd4a035"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "60bb3e635a2c9e34c1483043fd645a5a0a7c4cd2f20c7822eb516e46260ebe24"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "5157e996da554450b83e964eeafefca40dc00f1618419832fdb67483a06ed9ff"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "2a5398e9f1c76df4389aaf4a8f5298a3d9c1db67e408b79b22cc27a98ea80b8e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "93b5ed6e6b8618dfa99271fe8bb3af9b3b3609f5098b18641d512800729726e3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "d10c5d0a2f9f7de1308d8e4b8a4f4fee4e8d53407fb6f7dc0a8e2a5e40f8eec8"}, +{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 210104, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "ae540277f8a407851e237f065468126ecbc76952951ff1987a22aad3fe9f7d6f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "a0bc82da5bdb0ce1afd74b4e4139def0bf696509d15d56880b854e97659e1387"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "f0f78887e7698eb285061201198ce772816cc88461e6c203424396515d0b5815"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165072, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "3ad6fede0181faced90c4995c3081e20167f96ffb6635fa20aa1bfa29428102a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 195848, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "a9988ffa50128a720cc949d538b974435994949fce7775e96fa43d99d9a0d8ca"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 207368, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "e2fec8df2c00c606cf7e84f371a2d4298902617fc816d3b157921551e9e5c9b7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156880, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "46dc8398b993a77653e314df79bac602291064f8c4d3301f865b031b213cffbf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 190088, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "3d658b3a8216c5ec266a17a9a6a008942667efd29f39e55c7a9c370bff8b99a5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "63124dc16db55f09d0182a3dcdb11e82b8b12534b4a353021901f18d7b8c1cc3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "857ae2aa80cfd352b4cc809cc1df432601a16cf99fc5d5583163784dca8a7641"}, +{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "66d822ca319d309da5da72f05ae4c05d36d75b67f0ad5c9138f6fd14ce4e04a6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "c457c831310564cd037f22ba13002cbb88900cae8a689bab1e5eff265e444d8b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "38c7e65af7348fa34574045d8f2de985cb56af71b40356885018c7ec21b26a16"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "a1727e677d8e94d9c18716364c02e6d5b2519d297ff3474584106782dabfd906"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "c9fa76b5fc67b475f889a644c89113775677d0b222cc4d4975fd0ef36f112d93"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 172544, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "c62a707c7dc777e18f0e4f286e3b4f53653bb5df18f3d269a6bb9136f482482f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, 
"32d74db00db1facb0d588e39680fec4d60d4a3fa2887fe10107331e349e3114c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "9d388177f73dd6f3f109164bd916e5d2186e07a2490ad67c5589272efb880962"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "dc0147fa65c33f984ccf3c57e3e4266d229dad8b32caa9a84bf9ff5c5e9731a1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "3bc7e6090b54e16e8a3597bd34ddd1eb92d5d50db2e1da130369c25f81de5741"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "ec98dc4977d8b8050dc86069f96e6caf0e7e65f407217f001bb7c70b8dad088a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "ee56f4d59a0628c64ff36dea29bed76e5c8a966ac104cfb5394e884c5bf657c9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164192, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "300c9dfdeb7a9d83ba1b10cd66b42d44198e4f01e395592f630076fd65233911"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "67c116766aabe1ca595d56cf8f315923976bcd9a3cdfe26ac2f8f3baa3d41f7f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 179552, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "9479b0afafb1bb26d2378ce51da619f39d19162c5cfb1b0da13022f93d14422c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "5e94b0e9d87e8885eb923f53b7633a3fabff9bea3e1fb60ca4882d999c071307"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "d4e6b10ca5fcd10ebc6abfdb8dee159020c2fead413ab886c4c0b9bd6ea545af"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, 
"5714353b400d68319bd845d63d7072dc6d9aa25420e88f833801a73de2c5f737"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156512, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "a978ab88595d90a406569e5bb67949cbcf587a4bbbf6a67cf48b9a89586eaacc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "2918fb8526e5cd2383fcdeeaf0e54899b16e646fdfd302ff8063d373af3d0bff"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "9a41014107bcfb32eb560c880ada71259b0602e3671799269d3fc944ebf6683a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181616, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "10ef873298e6116e3163e44598e5f5edcb02caebd36b74dcebe1d1c55be0fd07"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "9379a441c221bc23e68f256a4cf8f7c275453dbde9fa956e5da4e01806df0e03"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "d683b19673078a08661928c97bf4bfae33ca22a6ee2e60b3fb612772e78d7553"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163248, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "da2dc02f2ea72bc9702750fb12047c5925e9d5efdd95933c4d4082da9cc0170c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 
64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "df4a604e532d6259da1e264ebc429bc0cdd752ddcc9622cb86f9dbc9b75c858a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 176816, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "3c699d8d53ca9ad1cde87697d32f7b0a6257b7ab6fe0d488099bca26dc45dbc9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 172544, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "528932cd746080ed10e97fb9f98e8e9163456d2b42781b36eb67088a7dad84a6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 165232, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "32fde9b3d736614a641eb4aa1ba564e80626ce5297735a9baa44457da114934f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "dc43b183588f8420f4dc2f5bb40064fa6bb98cf5cdbfe93f2fde428dabac6151"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156464, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "440b38a4f2eeebef05719c0999f52548a938c0767b9ca66c58c8986881251111"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "2469c72e6ad723964a1cbcc84df9e76cb22a6270bbed98cf736ecde6a23faed4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 
64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "36739f88c77cc4adb4413980ad1af3f90ab9b1a7b16e57850826f2aad57ed3bf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165072, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "489e383bfcb470db462e5e6c36f2b85344c875b30e720035d47fc0c9b5c9bf79"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "08061e9d25c042ebc0ac55e11fa2b1e4685c99652c870edf1d1bdf76912c5ea8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "d26e5b8ea5f1c1c92dab0f263d5ce4d815ffcc825c9e1d352ebd9b3d873ae2db"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "9c1b7dacf49d9f0326461fef9f6d9ec8f4392759bf01b639fc92dd8063a92e7f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "18e3183d15cb496fbeb4feffed2ecbddb02a629f837f9897d503345325a49807"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181616, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, 
"cb8b34b1829084b6cf40f7130c426a3de2e937ad6aebbb409ddfec97e6ddfbef"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "8c0b6573f785c2c6ad13d9924d98d2ae82a4e029bed78ff00228e2981f9c8fe9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "100781a86fac589bee61a6e9f7fc677424ad93314a95043da8c83eeb05edef28"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "3b02526688ad214a6763cb2a4f150be0a833828e1afcb1ff84658949d6822f7f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "c2dffe539414ecd48eee6c44b13df3d4baddb5b564f2dd7b6c260d644b545c0c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "a9300e4eee232b92ef1d2b6db7f0053156ba11afb5f0e887b89a81ff356b3f88"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "d9ef42bfd8af5339e0ca63f448adc86a47b05ba9fb6e4f29141406d713b4bec3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, 
"27300efd440853563be5ff656976b74d091ea6b1ae32eccc1ab1fea30f5d6210"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 210104, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "61d35626693ead80fde1194795a1092555ee27f63341d9efca389e736acbbbd2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "3d207e8ee65d02642315d01962e6e572bf2ac622db492da58941dd5a028e5e7f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "1242103bc0d27bbaa73a5552e806985c0f683c4575e49b26afacf48dc31768b9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165072, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "18ca47d6e51ed3c4d916d379f7f5d47c6f38ffe50e11ab628c6233835dee6e76"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 195848, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "6c41371144ce4413890e8a51d093e71365cc6e5d63141d202cc5a956159819db"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 207368, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "6bce54c066fc25e348e246d5802b60fe4e1d09d370402b41ff23053b9822f136"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156880, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "999d3f303315c1ce8b068a0e7b96f5240c68f32190cf2478a6bc643035b9218b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 190088, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "510fa8484e181d60d00c532d48fb1311e3f841cf52ecc3afc019016fc00303d7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "78ef48424f451bb12dcbf83e211c6f3ac5be1b917fbc03c98f6d138cc1119d5a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "d1ca470a01e520dadbfbcd0e36c8a7a6b2064815394df30b0c4d77e3709dcf20"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "a73fb7698c1621fa520a464d6ebca2df1bf53394cc501acc6b58d46e6655fc5e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "357b16c47482075143398f57f1eaf8f0f4819f252f54d20adde843c2d82435fc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "903cf6d23314db76cc011a9c17067f63c81d862243783cbefdabb704aa87bb58"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "9324333970cd1ccfaf3e0ffa02858ee75095efec88a7d5952c66c74125f61c8b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "b720b2c0ef35e43c02e666100696c9bee3a52f98626425a6f4b56c5426c9b2e8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 172544, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "f5868a78d9a42f774e836f25195cbe9bb76d3b6e669c4b0238609c3b51e1c05d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "8ee0de260304336040f26886683b1e80ce9d2ed248324526239e60b38429ec4a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "1cf48dc30b3a6207c7190c575a41ee9f567585f5c3bf186f4693ff95aeab0364"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "0cd2e589cef48a76fff9c1ee63ce4b295c7405bc5f56cd8f50394a15b9000f5a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "09a14ae6b0dabd1df7561a7850aceeb22fe86af4cfff755972314123a9cea489"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "8094aa79a0a8a90ad06ed70e373b94db960b69313f8b7155f9c8a316005847db"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 2, 3, 0, 0, true, true, false, 
false, false, false, "47c6182903df34b43d7774602b81e962952c7c5d017abe7da14fb972c8f58f73"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164192, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "d9e58a22649bcef944669c68de7866ddb703924d888f1be797fab74ecbea2d6f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "aa2b838ea5c7eb8ede10c8293854e360f120c65d10ea42e04f04bd34ed3102bd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 179552, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "2965fe81d03cd49045419c9738e35ec44a8601bf4b8837aeb34f9867ccbc00cf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "6dc74b0051977f25c82c0c15ccb8afe3e87c7f18c2cd0adf61ae7cb9bd97cef5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "7dcea40185742802620c62e459abdd622d9c0de1f05194b925ccfa77ea6aa0b8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "ea6cfde0e0c69a0e513c53c4b830189982d0ae5d0524a121de6e471edebe3fc0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156512, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "6efd3adc203519f72e550240376ffd06a5d2a66bd2a681d7176b09162bc8b79c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "d4f6dd7a3ca886efa89b7392f17cbfab50dc3b095204707c7b12099ac7420f83"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "a791b5cc667caa922127d284c7bc059bdae7dec283b67a95a2344e16df6567b2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181616, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, 
"ea9c055dd8d6faf8ffea260008c04935a318770dd4c1bceeb90a241c515f430a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "af0d58107a2827441e7a111a716caa6abb2bb6e1da7738a905e1d892c51e8a9b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "668a02de6b2ce93fff7822ce35fb11136bb6d9a970f24e486a8869073e20c817"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163248, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "d5771afb993af0354dec0a98df7370662c3d0e9bfd3665af783e3c9396cac697"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "59745e6b0365e3c9dde6aa99d81f3bc64e029af12a625f693dfa3a47cbf60bbc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 176816, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "519f7b721bf71f750e3e28870c054744e4f366d2d35b99b49d189e5fc8b67e7e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 172544, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "ecb97efb91656386a3170c92e142fc14bc9a7734330cf10b43d5bbd3e0a5109d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 165232, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "2a2abc36db9ea306407ee2a9f780e8490befe1156093f8b488d9ce47e0ff3146"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "73f9d58770d337420dbc7c9c00f99dcde41e33f14ad3ae68f70125a894b4e426"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156464, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "d606aa166b84306f5212e4da71681a01ff4a5201321a8d5c59f7e031ee071353"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "3a4a1ab50717c34b849557650a8ea35ea603b9baf5f37e37f862470abc754fc2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 197056, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "0e73ec68c5538113464d97898f838aae8e48bfc86a67d576882183cf3ccd3a8e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "8e3d68698517b3da17b30c42ecc3ed0225b0342b4291d816c36b017a504b04c8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197072, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "7873b9465683367280b31cb98cdc8c061dd2bc06550560753596b79a88656641"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 
192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196896, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "602532d46da5744da7f58ed6f856263ee4e302caa5295459ce7d42797dfb7bba"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 197056, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "5cd2e0c3a10c23f9a1c06f377190f8ab00c9739a96ee81fa2206ee31f95505d6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "55075e3711fa07a41eddd7740e7f6a0a5e840acd72601d93d7185da6161e7a7f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197072, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "793b61e9e766a3eeaaf7b4d24f664ab4c7066056597f4ec70a4c5a1c2c5d6f0e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196896, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "91098718f4e39ae33eda011556d0486a739ca4dcc1dbd25984d887795619c31c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197920, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "dbf1ed524ed6b90844967e3b280ae6086cea44cc4680aa3011fcb50a96e99358"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "77e09e4b1dd5b0913df9b3c5212201dbee00aa4be7235980bccab0f0612e2d1f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, 
kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197936, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "092b0360496f66faabfa6567e32b3fc8d505cefa993cca0e0f9d33fd055866a8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197760, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "19c74e6d296b0ad71bbde1071a95b8b27114a591473cfd4dcc506a406f8eef6d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197920, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "966298ab3180ac8aacde613f5ecb518642f75dacafb61a41730fdc2dce4b800d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "d1ff3f25790c05a0b0f65c9d1f2a9ee2baff0d99cfad575ba35846d2eaa552ee"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197936, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "d536673a66bcb894094a88816b91b2d282721f93bbc5c57ecc39e3743e506fea"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197760, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "b11446d18bc2ec285ea52c890e70468fd60938e7d45b0741bb5504d1b6229703"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 197056, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, false, "d00c575dba1f43df694aa87e360baecd48bdc0129781b6e5de96e357f34351bd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 
128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, false, "b434a60e51f01b4a5ed4f2fab56ff4a50a4ab965fba3c46407cd4e2d1794a95b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197072, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, true, "eff725787467ecc59c7aa9dbfcf8f1365f846fbf3dec0be63e5fc56a7be2974c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196896, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, true, "65fe245a9530943dede38d9cef4c4ae0b2f27c5fa3137146bd55f7e2e8da3387"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 197056, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, false, "3b9e920f88fb501f7005b16507b657c3948beb8bf2b7bed2fcceea925530e0f3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 0, 0, 0, 0, false, false, false, false, false, false, "0ca3333b2d0c2bce40d7c0bf21069f93e82fc3682924aa4cdd4cdb0f229d9ba4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197072, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, true, "ebc08c7d331cdf1ece134627090f0ff6680c0364182777365fd1df9b6f9c5791"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196896, 512, 0, 0, 0, 0, 0, 0, false, false, false, false, false, true, "d592777122b00db8ba7daffcdfae958a3995125188bd6320af3a1c748fdc4de7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 
16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212072, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "a4aaf55ab2e075ea53fff831ca12ef0cf3fd24715a74de9a60274dcd76327b49"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 224440, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "4c1c2ad596132a4475d85418e1ca950a4ddfe5eac1037470248fb29a64b090d1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189032, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "cf72518b3ea9fd25b02836fa8525ec0bf776c73a9b0a509901f37ce93aaa8786"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 203448, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "4f26a7074a758aaea7f61886f1b3096ab2ed272034f8db3be269568b8c78f182"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 194744, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "6ee4708314a6f8fda12c3f35a0ac5bd2b0854607a8697bf3e6efa04cfbc142b5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 207112, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "54b430df1ea1120ca13fbc194004f9fd575cdbcead0c739aa5548ae415137d5a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 180792, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "af6b0ecab1d8bb8433759b490dadc6d39a52600f8d06f19245087c47babe6c1d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 195208, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "23f15b20837f7d94a3e96a29f1151ca41301796c35d1ea2793fba2c03ae29446"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, false, "2510fca29f4d278285bc5f20adc2b09e15d1b108b047e2e3062d1e77f3d825b8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, true, "aeb5fca82c889d765c854e53861e981f59415a35d2f1f336bef8d26176936532"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "c1f7a6e9538b7facb7851732347788ad6c73597dc470ccdcfa8c183c733cf081"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "a1572a8ce94ec5a03ca1bbbe5e624b4f834526f4ff4d232058323cc105a788e6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "cc5828d532cad46d5610620fd0a803dcff02611da09135ef3455fe54eedb261a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "c92c83db2c35e2d8452abdaf5bcf42092c72ec6888c012992a9a0375bcead018"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 160944, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "fffea884a7df427a6f0e6559c279b124e1c44f69ba2084ede923b53c06759c13"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 173312, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "257409f08127b1e8747ed1d388eef5d1f28518f6776ec886df36f103606e89a7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 146992, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "fd2e73095395bec0f3ea0aeecabf4eec58a60b99950bca674d96fb6cf6f52772"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 161408, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "5788863935ff0be65aaba5645c1d7227d9fd17667dbcfe3885296c1585514f61"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182544, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "4abbf182ec4a68aaf9ee46d881e32fd0d4a0c161245586f727fd8e73d48486b1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "c074384bc1709435d16b73103b21be31c5334db7bc5f82bd1187c92cd37f803a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194912, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "1e636f55cfd9833ac8c23a7fd97893fd3ef056ef13ffa73469dbf859ce194898"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 
2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "1428b06925dbcb36bab0cf0658e63133192d37ef17cf5ff75321a7aeab5d1f4c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 206064, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, false, "8e8c20d33691aecf2a62ea89dff31a8f924c50b0ee56033cda279fffc9763369"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, false, "a29887ab0fa642d46bae138b846006633cfede5338b60f96d7fb03c2a93d91b2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "d27aaf7954fb4c450e6e10193aa9c7dc35053764fe21574cafc6e15a98910a0c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "f84a9ce3f6a6b2595221b6fe2b4c607aeea05eff1d2186926fd71678d5f125ed"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171872, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "3c9558151e44b5fac1e6cffeb15c9135f2359eba2c13b791853f7bb42a133cbb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "1c55d0dda011088d4b2681948d613d7aa9594156fe9a9c109d42c198f061454e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 165216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "6477912ad6b69baf9a32517836d0d6c6cceec2ac27ac24c7cbdfaa8592f2a00d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 160944, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "4ccc2fd5fd0542efe60f6c48453e916ece90c4cb4815dfcf97afcb2c65b26092"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen", 177584, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "7d402ddc2306a8f9599fe894b692700a5265a37e4c3023bc93ed1bd8aa3ec582"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 173312, 384, 2, 32, 1, 
2, 0, 0, true, false, false, false, false, true, "966991caf80d179718f7b9bcc040222c1059b142d2312acc5ed3a5894243c6d1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 206080, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, true, "7883948d5f65335b6f870d37925b63e31cf95602f041e5f0c1d7102570591f96"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, true, "3aa9d52f1e921e8a5ad0e6e84329228463b4ff1bc55b6f37b74101042f652dc8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 149216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "3a19a3cc21c8ffcaa3f5aba8a6772028bb27c066433990634806c608d2e6568c"}, +{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 146992, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "852c82792305ef2baf066fbdf4070c88de9ca64b186e67f24e0782ec3772c6ea"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen", 163632, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "5056ca285814c4db5dbed5b66cff57a92f34d03a768d34c44f0e8df2f4e06727"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 161408, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "3ba1d35b80878e5cc438299272f81661dac0bd1633ec1160d2d4456e8365761e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 198728, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "90fdb8127f744756701df33cff35202626f9c285c4cb8876fe7ee04e67dafe70"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 185032, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "d7828210d81c36318297e97eafa05fc8476bfbde62af43148e4e4a39a2ef1323"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "6c6af397508c2bebfee538200a86d925b9b51ae12626eb524a4115717d3c3984"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "5b5a5eae4dfb8f4b7152ffa785b0c6a822fe5c99a25f5a4cbe3e5b77682cc18e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169200, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "f458b067b83b9242074edf073b836b32c80cb3a0c1cc5c0bb5c912792263c936"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "8f580366a7aa3843d4fbdd1c3f215ce4c70a76e72455bb6811795c5176c8d6ae"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153456, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "d60fe24129aec0abefa0565096682e9d3ed70584d94113c827221ef8a0c81004"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "c7bacb945c287bb70056ff0f9a23e1adb4e170c11b7fe5a072e40dd0759c7859"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211560, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "c50d967885afbec7d878a312e90b49feef2a12ca332cd23f256cb21a62704bcd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223928, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, 
false, "8d9fa3d146fe9dbb0cea2a94043ef1233ec9d0976f07b6eae03bdb480536c65d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188520, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "33095515fbe99d1bd26d1ddd69a7a719533621b4c7385342e7cf20de8a42381c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202936, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "b46081252db5931088131c366792d551d418d966e597c1579ed98c7956d90b58"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 194232, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "94f0c7c15fe6dd4541213af70d62ee86ce522050be941ab6d5c0489c3792fd91"}, +{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 206600, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "f91cdbad30dc73490d8443a5104a4466ad2a470eb0521c42cbd5582a3a8b9bb6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 180280, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "788a1e3e78560ea432f5efa19db87b16718c3421581a8a219f09c8c6560f6573"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 194696, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "0c9133158acbaed6531b08a62309e2ed08d95b2e8018d5a46e6cf55647c4c817"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 
128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223880, 384, 2, 32, 1, 3, 0, 2, true, false, false, true, false, false, "4ae5be0f5965d81aed75cfa90a5f9c37c485a131e543f37eced083e2564e64ad"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, false, "14499ff262d4263785503ddcaa6e50191d4fdc253895abd0d487292b19b3a546"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen", 223896, 384, 2, 32, 1, 3, 0, 2, true, false, false, true, false, true, "925894afdc9784a51d381c20d35ea9073631170fffae216f92876e54bc14f7de"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, true, "2dabb99e7d567a0f726b64419c66800d5a95d0fdee52bd9a9e180c98f23615cc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "a919a3256423e5178572459406f001614790cb10486c00cb187514ab8b5eade2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "71158488a5b4059f44a7fc24488818b5bf6d22f724aae70f2ba42c6d24208814"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "a435049844f229a9d990e1a2f08124a584bad3306d4d073851fe68ff1020c09c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "90885072e55ad899cbe7c0797b5c5daa73e690859b9602183ec7f2ea1de6dc4e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 160944, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "8a8dfd6074ec5b1a6ae722878fb22fe17f603ab6c885ed7e784f49726b7a0123"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 173312, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "175303f62e0118f3a55bff9df07bfc3b5af5bb41a496a6450b9731cdeb04c21d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 146992, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "1f1a45ef8048e242abebd51d54cc5bf858fe03bef6e5ce389014d72d17042304"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 161408, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "861486f6f9939865b01a519c666add43bee9b4f00fb7d0c07f11ef2caec62a63"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182544, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "245c7462e0219ac57494cb7e4ec67127d89d065ed1872379f1656e030b4169a7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "627b4fe8179b138bef6880e6d8fe39e4bef9466dfa7573719921e7f4ef1425f4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194912, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "136b4699b7401135ae625b763d57af41ce027e9d26ac7d2280fd272173fd907f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 
2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "437e0262eaaf70ac3202bffc91ee9ca4c8e5cf767a243f088ee14092e9192551"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 224040, 384, 2, 32, 1, 3, 1, 0, true, false, false, true, false, false, "ed97a9c89893f5ed3d5e8333e86f916ab867012701d6b919b62ae549171cb34f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 206064, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, false, "c6ab6194a7a96c09c704abff233c899485b0634df7e95e4bfc3d9ded319de76b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen", 223864, 384, 2, 32, 1, 3, 0, 0, true, false, false, true, false, false, "00f32ce5e6154cf741b6342f079bbd51dfd2434624610028f73ba4c6882b7f47"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 
512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, false, "eabbee2d7f718faef846060db74ece187cc06abb1c8ac4beb2f6d3ce70873d81"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "3beee93861423072ab06fa31e52b33a1907a6d1233a68a878ecc0cb713e796e2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "c4532aae84ec9e3efe740234590b2be24c9fa319d97164cefc9b5735ff243c10"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171872, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "b5086d3141402fd1931b163a3c240cde1c3bf12aeeb82559db447b617e14c651"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "0071a063be7588713f0e430466930a3a2451e9d61e0fba0dcb8069de1bd4a02c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 165216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "3616d5f945de359df932bcaa81285bcb780537f1d594e66c2043a185cb762837"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 160944, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, 
true, "998f1a07e15b6f7fb5cea9d93729a7e3a466fb6b573b6bb85080b3d472ad5e75"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen", 177584, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "3206df87b9734400950eb7af4f83b83d5f8866b410d5b864bfa93515ae276240"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 173312, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "2a9b515b6a8955cd41829f910de29fc683649a23737ee3a7b8867cc4a652f366"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen", 224056, 384, 2, 32, 1, 3, 1, 0, true, false, false, true, false, true, "3f6319a96b6e2bdecd4ac9c44ea792e8fc7e2dd151587765c00959047a5dda99"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 
64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 206080, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, true, "ea9bf820ecbd6e9e715e9204fa75192f25f21dc0b5d442b78cb1158a3c18116d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen", 223880, 384, 2, 32, 1, 3, 0, 0, true, false, false, true, false, true, "854dd96f2fe5099ef3048b8f94ae0d6394e91d1de0e58e635ddd10c31ba7a211"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, true, "b41ddcd33339503eadee7e971dc96dca7d3bf0daaf2f777f1ba73e088f1c17e8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 149216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "7945dfd0c4335e9ead7a53a598a4f62ec04f72509130c58eae6362e6bd3b67dc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 146992, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "52c59685267d3ea2dc8aa2de67e70d6177a87ce114aa5062bd25a51124d516bc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen", 163632, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "909a734dff02494819e48b8b2562cff0e3dc2af16c67603fe918daa03ebf7ffa"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 161408, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "82b1ebd498faba3f6e3e722e37ba4356803dd5857e34d1bdb57fa451126facdd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 198216, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "5dd242d5a8b713a6fdd71da9052757f7c260ac0f71062730dbbc31356704c5b2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 184520, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "76fee688e230f1f53bc9c3ead16a58e964f7af06805e240af00f045e7d94d4d0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 229256, 512, 2, 1, 1, 3, 0, 2, true, false, false, true, true, false, "2615cdda13797bc63654c356f50f935720a36a7a10694cb4f1517f25e77b7ccb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "00b7c88c1e6646bac2503a9f171b5f25313f62a9e3b4bdedfc8f5861d14ae2f9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "de7821f4d659a7e160dba7aed7a0dab79822ce789f4f2fc5b320b5cd782de901"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169200, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, 
true, false, "aff7eaf4f988a1f9a45f1ec24f011225a2888fa4865231485ca6a0e999e1e0a3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "c47b3d38c2c692bb891c637b3982517d7633650b8ae8069417690ebd30ec236d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 229416, 512, 2, 1, 1, 3, 1, 0, true, false, false, true, true, false, "8a1faa56598a1b9f14003ff97d479d9310396a7ea77cbf979a43dba0360ec097"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen", 229240, 512, 2, 1, 1, 3, 0, 0, true, false, false, true, true, false, "06b8538acf9d9d52ff9d620785f800a07beb4c806b96b61c3da0878984864256"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153456, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "071c702dc4bfdb7c28ae079429f087782b768810bb5db923d8974992afcc80ef"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "d926299810d7dd0dd78cfc952e1ae714ca743df0746a11bfe4ea000388da2bd4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211304, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "d8f5b94ae5711b14e3a68d99d1e29975d99b2533dc5b856f9ad9ffeb2aebedb4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223672, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "167b9adb7af7f95c308831df15eb8e439e9f86c265830ac82ce8835c34559254"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188264, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "1d629c3f4d44bbe366728f6b916b174cabf64885c8e54bab82413637f640f9c0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202680, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "206483e4d0511f36ed5b1a5738f15e178ba92a3107ff49557f99db567ab3f8b0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 193976, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, 
"8586b8f6b74bb0fadc06d5cafdb1c4a17df572f11f1c81060fc2f1a409ed6ef9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 206344, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "07d0bc443b40d026d98171fdf0a011e9efa426b09390dec28e4ce85b14efbd0b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 180024, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "98e435a8059379c3c94bbcfd3026a4a334c304e078819d8ceaed2a2bd014ae2c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 194440, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "dd56cd49995cf67f589148208005f04c337fdb3b4db2b6d1e2e9b51f2c1f5dfa"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 
576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, false, "e4915723c7a54980f69887fe1350d5e59bc5e61a354399f264f15e4b910b41ea"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, true, "338db8455bfea5f8a2ca0790fce7ed9ec4071d2ed3d51950284d7cdf1aa078ef"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "3bc20352af7b75e6773c5c860051a8bc4192b75198e57988b4c2dcb077613ca3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "94fbbcb0b24ac994faf1387adbf805ddd104a051ef8c018f84a1988e8ee7a24b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "ad968d2c5c5eb16a4644799d7757a0a6343b77d9f6f73157a4d5906979822079"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "f905fff2c69ffe9707da9fc5a9a085b39c56f19697eb9ba726c19adba98af8f6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 
160944, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "082efae77c7cc6030f15b68db96b2debce8ecc3dae4d4e43780a3ca7d2c4b95d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 173312, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "8b1c4b84edf5855be72814bce05e755ecd70d731303cd4179c216bf988c11a42"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 146992, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "77c59868e26cb096ecd3bebcb1af40b4c4d8f9e8c2eab184837e97a2d538ed03"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 161408, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "24de828486b26aef87516ecd3fa70a876e20320f24fe5b55ad3c75ee2340c778"}, +{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182544, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "2022ca22e8c574c184e079f8379e4912ebfd8bdfbada5f0480e4b1ee5663ffdf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "5d5f81053ec78b6f78b001b34519bd90cd41a5343ded713657aaeae9e0bea256"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194912, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "3e6dc39ff245ef6fa9979aa5ed02e9ff32081c1ccf11bda8c568012a25bd9662"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "7e28a637a3eceb6e1f0767fdc6bea76aed7ec0f1a6ab0d9400016280a45a5cde"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 206064, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, false, "41448e35fdc99814e29827077f8789a370805be52c94d8831f16b0fbac4dc936"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, false, "e5f984b007bb511fc2fc8dc315cf94b2e5bda07dfa0b12ee43cb3c42cb5ac0ee"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "5b596260a71400317fee4a2b5fd594f4ede8c0f3457005ff51408b408a3d8c6a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "da4b7b6578339661a6b39cb1bd7c8d44d15be3dc8bb7e73bdf01fc353fb78f8f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171872, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "2c80574173ef052b27741931f2bf13ee3622b94d64181dc148225cee41907ce1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "b0e49cf5ed80c532d93296a71f1a576f2f105687bdb4726cb1e06bfff3bedd3e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 
165216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "5ebb040e339a0cadf8be32454fb671c20950010e0fbb42e82ab2ef2bbb69d780"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 160944, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "aa6958eba7867e068fa0acb1beea486d92457a63ffce0e4de3a781736b35f0f6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen", 177584, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "37950e978d4448c6f3e21a197be4798468f182f33e1a2385ecc4566c6c920eb1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 173312, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "41749feefe7f410c90cd585791e7cc4b4e05fe92ef2476cf8b9fffe06b100f79"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 206080, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, true, "63cdf4dfa9121c10276f17a5d8e4329159b384fc34252d1386765eeee890a5a3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, true, "87f936065dd8d1d1f5a6af6d14f7ffa6cddb51cb42327a8d1c39ccc7b092840b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 149216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "15211cb466225849807ce3233974b8180ef9724686e65bf362eae0508c722ca4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 146992, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "49cafa6e129116c824d42467bc9e8740f7e0e007fe4087e16829f3de1dd2d466"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen", 163632, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "d2d31adb3ef2a9fb9f40a1817009541bfd16700df95aef802bf2eca40e9e6f35"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 161408, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "74e2cfa49b3453e5107d60a9e45a010f124bbeaa1c694628a6f39491583bceff"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 197960, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "1e371dbf99014c8355c79935ee9d557f093d7a7022372634d700ce72c487a894"}, 
+{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 184264, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "ed5cd8f1f4f60a071f0026377aab5c0eff5c5259358cb238e57cad7c9979f5f7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "86a1416b5da0b5a5e9915b618b3fd36aeef796d4808849ad6a5d5117842ed3da"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "7ed4290ff95ccaf7e1599604ea9c115d0f7c54ba5faeb2d4b630b38cb5619fb4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169200, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "66b4b2c94f318d2c9d8cffd3821d3a1a7917cede6d5561a14308bd2ad33bd499"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "cd36e7e2217becbd9cc886a77683bf0831c4675ccb2e01c35431c7fa96d7c407"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153456, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "b40b3087a7b1d3f8a574d6e7e870edf7d37ae0e01984c2ff4ad1dbacb6615bd5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "c210df10723b2700b3cc2dfeb41248eda076441d83e827ed865538544963c788"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "6050709896852f717bec6010e1d4ba7f7a57816507b4fd450c14b432a7764aed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "38286e29b03b1a3c26421c1957f788b48727b7b0f271cf222317f7426160b1d1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "8cb74fef1f2235183211b902d070c1d18846bc4a6b07b8a8ac86dc2464aa13e3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, 
false, false, false, true, "e0e3e7a2f58809fe396fb84c3c1ea3e1a50a397f81834109d0c218bdf1dacf80"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "a35c54192731dfff9fec65f7938771c49c9d63d72d4dc0e29bb8b20dacd00017"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "3fb95653178439069b1c2e2567e49b9bdaf22c2eb3f25e709ec9d2ec06c6cfd1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "6822b7dbe2b36e1990677e19260efbbf3effc570d9629ba1a876a9dfbc129d17"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "53023f60b2f323ccb02b91e2476ba3232c0a69b03bc565adac0cd0f83b6a3642"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "3315ee50f2a5c35725715abc5d44a81308e3372c7478203d196d50cb7e1e1d30"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "2e084075aa5ae491898811eef5fdfbbb8d88aeb897da8e5a982a55cf0fd2b1bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "595ade11edd30b522b7db9f43d7c726698b4a301ddddbe1b5790b537a2d46a36"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 
128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "1e17b2ae665db8fadac5420b12c1e9448e4f47d4e385dd48d58796002a7acb72"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "0b92212e11b9e4eacc6adbf4f60a2dd0f612310c7d4318d7c1cdb5e8508832c2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "60d219940b5f4097275f6669c2f32c370d89e078f72cf8ae4744b964407ff2a9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 200888, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "76862628dfd0aefd389d2ac864633bcb86a52ca38f769e2532c222203641b072"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "180a006a50bda8d4faf8a1d657819505256ea36ad676bf7642ff94d14000f69e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "c40e34e4c6ac8b7a911234dd7a0b7bff13c649fdafd31f331647a2b16c50f006"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165072, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "4cb2fe1b3067fed37060da1e1a090ce3f67b01b042768cdf6072659d160b5b4a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "5f111c5172bc9583186cf7caa592255b6044a5cfc9dd1dd38c99eb3b39ad1bce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 202264, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "0e648d88d3961abe66c013cff16de3b4d5b8ae42b6e85c3a61150f2dee728b87"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156880, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "76afe3eba04116971841b317e1ed0c2a07144bbd3c3c295b5ab7c85062a5f48a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "36593e37bc33f612ffabbdc9f57da83117417a9be3ead7976c422dd8e7d1d8a8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "29def9bbfc64c71e60d8c325428e547bbb6468442d6e0d39f88b91a89bf11cf4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "3191dfe8155b741bd8d2fc5a0f9f2d0339dbb2d590dea6e19c737be557c6d132"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, 
"94594243f4e7dca9373ac84c854acbfb246a58a3e030c4910afc0eec99f3ff30"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "96231231aaeed68da95c75579637607f4911c3dd3b7d9edf0ab2df0cb9126047"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "0ed52fbe4516ea66b67126dce3515da7648dcb23cb958eb52c793fec3e2901cd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "f8436cbc08eea510cd8608c38f4547a0dc442925528424d6c842e7a2f9d9ceeb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "029a280a70f8db56c1b0b6c4a254694878c70bd94ed8254b2a92a57d99ffd2ce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 168464, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "583b58a4d4b2cd3f06ce0af4a078bb2b2a21200d35737a4c0c3029f8f53157e0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "8aae73203632f81129e91f6b44fc48a978998242ebb07965fbe2e90a6dd72c23"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 
32, 1, 2, 0, 1, true, true, false, false, false, true, "7b759f6fb36b340296a9819d7e70e46ab2771d51f462dd12da0aec80f0fb995e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "6bda47ed6e36b5f2c1fb2ae836a20c97c9c7f3d021173cc515974847c13b1ca4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "63fd3bcc0b94923d61c1a2ba3a0b0a938f3d1d20403350f9a173c4dc0723c3ec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "24f8d4b0d13023947e661da1e389a9a969ff5289dff93c3dbcbfde2a731526d8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "ea736cf8e2facb1f9ecacf6f16fe5e12d397bff9bf81d0e14239775c744652e3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162144, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "6a971da231a61d2419acc425c4cd40008180b0702b35a31faf40e2968b857c88"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "e4ae93331f07e84a5aeba5e043d8904c22689fc7ff4fba51bb9ca326f39f8ae6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175456, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "7c0bcca138845dec15bc59472f6d7bd37db1a20ab0d4311adae40feb3af36791"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "d70a9a9c88da3155bffcf28e9816a88daf9f64b7d40f6a3bd02dce361f9b7898"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "438c09988516945701174d74c17b69659e952043ce362695020152d966b972f5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "a933602d4adfb09c5b9983605137cdf3bb8fc968fc374f34a72b3b1e96263491"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155488, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "799bb82bd507062df966332f78b8758dfde6a145895c20816133e425a1497b2f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "2cb7078591c6aafb32bf425f6b6be3029f4c64b420fb7c02aea2662bf7d0e86c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "f297f48456b3482e5a48dfb626cdcca2e46fe9a442ddc9ceef06f103ba0e12d1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181616, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "bc1451f949e3a64eeab4f1085f4711bed300660b732ffdb6f620cf7a0773804d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, 
false, true, "e0cee986b109e4e15f20e6ee1329bdc43a860659c843d476094b9c52d4355f50"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "f836417e63165b57dc2a8c99414c6dc7c049625222b85dfaaae5e3a8f6427114"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163264, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "5e26624544a18a39dc5a7c30a31f0561d8941bbc40b90af672e9be49bf36bf90"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "0e5f084cfdca5341d283480f3f5dabf3c80a03f0e314012b97a41b5490ccbb18"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 176832, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "5b5684be3dbb47cd7bc458b3596d06f330eddd15d8e924f17b2b888a351ff0bb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 168464, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "e837988c94e2b5ea019b2d8eddc1223849b1da3985ad3b7c43e38e577f22418b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 165232, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "dd651a88f5c1a29adff250602917fb61594ba1ffc9503e0396ea5e9de63ac92d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, 
"f8bbf5b1bce19678c4d514ee09d0e53da49b8ff3a357f9e5583a878b2a492a66"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156480, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "8d69ab9ce684fe298f70e92afe52fd2c7adfb56d44e7e46303f67fcd1f98e2eb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "ae9cb72d1119330a2f77341aaeb01e3ea4ef63b756c57bbce0e304ab028bc127"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "059138c6795c2202fc5f0927a9eeeff14778ec321e499637e41548beadd9e943"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165072, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "93b7513e3f9db75ae7cd0a320311db5c5324a0cb141eb65dc9554643fc47857c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "e1d509bccfd305d47c6dd258bcdea8e5033a07387468c713cc7e28ee004519b7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "ea5083927172a693b17a8dbdd6cbc75cc6d05e38c209be179f7e1280a3f89271"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, 
"64c78b5ed750ef8975931accc76162736e5b08b650ff0f1287f7ceedaedb72c4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "9c5afb844b9a511967e1d68f86c0f769d4bc9deea025d6aeec5d43189ffe7b69"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181616, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "64316a0ab9b2583251dc50b5299f8f5d45ea44e150a9b070a581f1c751460daf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "f5003ae97ccb30fb5eeccc9a1812b2582d73fdab79fdea279dfe74f25a529db4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "076924e70ce496e905d95b32d1da7ee940dc94e464c51857424af1e1284724c6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "5af653c8a49a4d395e960446a4bb047ea0b197033992ac3261d3aaf592d6ebbe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "501c6eec0c0c730389385973cd730a89f0ccd56e3bca7774b5868ab4baf7fd2d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "24d961a5a643e9b55310e356b699a8560b87a0e846dc1828c11e30d26cbc08bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "97d79500ab8837a125765c85f9749a9d1009e4ad3060dbc04039dea4408b33f8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "155604ab7231cfe6a744a97eefcced25992e99b2ef5b7e9bce69ec70ef648c4f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 200888, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "9e26539abd089145a691a67c9a0159ea79a12dfd5fbb3d1b979bd284052a4e29"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "98c1848fc3644f31a72afd47434d12adb52ccf7e956f178c76eae045e381e5c7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "eed6bc1deaea3cf40d85e02b3937e4a1cbfc3486e41c769784d0536b475995d0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165072, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "c2e80d6eec269b22a551839356aff7ccfeae6d0e07511d6e477edd0db4515043"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "0d5beb8678670cc4a60233d2bcbea4754b67fed864073032afc14d639a9fac4a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 202264, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "eb0c4373f7b701d82e3f2fd6177e955416bf6b25fee593da94268ddd9fa6783a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156880, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "a8e938a3bac242fd184d28762592a97a589902f9a73ef37934d1791c7cd1223a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "e0bdc38a182b2d43c9bc5c1ae110cc061f47971a8710dc45ce8d371c137b3453"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "8cd0fe21733aee9fe830911c667b155e834aefaf563103dca24e3a4668447cc9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "00d67529d6834fb50fc5ef33e7b6d3fa9b0fb1b8dbb1034681e6a34e282bbfb5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "a4a197f217c463ed659baab4ef3e560172ef31e3543f2eb8211d1c6b73be057d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "a4af1f70956058094075dd878ff77f4cd169c639e6c58422e5369a2ec888372e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "ffee1b2ee849b6e2e2af3bc4ddb960ba89bca1dd0c3a4c78b7e47c305f4e5c8a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 2, 3, 0, 1, true, true, false, 
false, false, true, "0d4584c527860c06aa94a8412ae36ae27cb2d6d8c31d56dc4de29de4a5b86075"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "9f4b159b7e66761813dbef79ff62c69fb4e0582c685ee7878096450b18886c8a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 168464, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "42dc9e8300f757054d3af902c22d15b3606964ad8fcc79b61966c1f7329c2887"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "b3d80b2be7894ebb9a02a5b2742945cc70b007dbcb499de5a6677121d9f29978"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "295fd3d3717b024071d9af34bbfd0081a972e6e6e09eb4b35444929dc9051c72"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "d0c75f820461f4c11968aee519dbef058264b0eb25d901a77e4cedabd6085782"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "42fdaca17c410019d797bbdbf4e76abf3a9d397b6e4ab16edd095f7524f44d9c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "2e5ebd1b593cb9210387798e695cf06389fe6d41fac015b79f23640fd3fa667a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "9aded1313160d2e0900bac1bf401067c5a3424dd95cc717d75b72289d5485803"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162144, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "179e154d6debe9c11271e76e6f1005fef8025385eb36a56c5ae5bcdd72fdb016"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, 
"1ac4fd143938c5822c7bcbd8566084d91d1b54a398e7db615873046ec1c18a0e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175456, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "a2442368ce9f18527644ecd53f1dc45793db49151664419303b768aec50d05d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "c52ba4bdbbb7f02b546108c1ed92a69d06e8bf730e444e6f026e0d0c7f7c1596"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "c81ba7828a4dca427f949baadff534921077e5c1af059e4bae7ce362b1ec8962"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "def7ca9936a79303e7fffdb24478af9277e7de0e5cda7d70dfc0d6a8c95abb3e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155488, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "f10e910876b696c9ff0cdc7f7d29dcf851cae9e08eb894f22978c23b79f810f5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "de784542548f81996f7e440ef7f5867a141e38c5e2d8d07094c693537c68f551"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, 
false, false, false, true, "1e5724179c5b4c8f83e20985d7cfa42c8d3e6707cac95116ca6d67daf2755dff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181616, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "5c3d951341003f2dd2a6a02e7be47540a6de78942fa58012cc271c963116f69d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "3594f89a9eeb188532127e43b9b2467e9923546bf7ac36e1ed175f6ea20ecc03"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "ebd1608c49d019ea601394df480ddd508d9c269b4bd8b9954df1615aa88d5b54"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163264, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "a72595e5a846374f8f2fe58f3b78a8c4930543e3cfa08d607ec856408812b92d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "01c942f416a944168d9a0ba0c5be63a6c8feb6ceb8e7867beacc3751959e9813"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 176832, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "f7b636ac058a9831769533a253bea695bb6fffee311cbe0cd26bdeb47a41af21"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 168464, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "0e552789a5ecdbabe39c7d397b6980816bc1e5e523a33207ea3a4db8a0756d6a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 165232, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "92e8bfc0c570513568e277cbb5477411628c439d64a059f51ec6cebd4bad1f51"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "453123975221d8e4994fe17af395649cf11bd5152f842fbb593748fa0f18b85e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156480, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "c719afc6c1bacd9596ce4d1c08495dcc1aa404c4a313c3a566098e935505dfe4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "427b40e6dbac8fa0253a659f8e24dd479eb2dc3a80e8798a79941b332ac7757a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "63a99c53d79cec62d8fe593123602d6d7957f7d6affbd8638efa56f4dea06acb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "68f012d2286e0f8031e0e71e5a2e1beb1d44853948758e8a8ddf77ba99b3b337"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "0afc9162419c7577eb8dc95114eaa2cdb24881661e3cfccd1b1f40cdec3f620b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "5d7c2c451abd74f939c9f7272ec06493d93d6b3e649aa54585cfd21b62b16d03"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "5d6c73e720cfcd4c5e3c1185fabf1f5c28eb3576963bff4d99389c3178add8e2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "12721b66f277e593cefdc7c9ef4a8689063728335701369234d0498e46dfad06"}, 
+{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "3f490241d21905bb11360bfc89e072e0b9026ee0593fca880235b822d74391c5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "70bd8bc0cdd26bdb294242ceb6f541b3669efb52e7415063bd6a3db782799b09"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "f1cfccb02ac6e1a6884a8acb416c9fb5cc8c1fee78fad388a60a28bc5ff572dc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "8f77846dfe2ac566c89026e5d3ebb81bafa170e9119d9dfc37dee34e53e50796"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "5d762b1dd5ca3a56f004b6b11e12ecf761b8685d0bf3fcde03f18cffe782055c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "b2ed4a88c99a621606ed8d4337d93ab85a4b79b017469e0b4a51f6bf3af1ca1f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 214208, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "30133578285a3ce092972e568382ca680ca28fb50b8f41554c38550ee0d794e3"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "af2e4fbea45101965ecebfb4967708e1eca7d098ceedc960faf6f6f390ec13db"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 208568, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "3e4641c82b1a76401ab63de5bacdea4c24723125c59f186273e2d8b11b35b53f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 181440, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "1f5000f7b400acb61d7d05e68e72548222882e10aead1838e5a44da3c5e24c78"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "c0b7373fa25ee7906b9ff07222542e73eabd2a63c3ddd45e1e9f6666aa5985f6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 214224, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "4263630d1063d5b4470e983956922ca54ef1bbc6c7590880fb7467e1a8315efe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 196376, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "55d0da2be3e34b855f17d8e59985527c242a924f8b2fc970db689ed7fd7606b6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 209944, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, 
"b9a5b64d3bb72e01677bee5467dfdda7ed61537efb5ead58e58c9efc328309ff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 181456, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "cc2cec4e6613828b87ed2d98a2b9584f5a5ca1270c9d76f2cc67aac2e75099b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 189592, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "822f98ab7ab9086d9a31836675306f7b0e809088495ef459b2cecdab4da146f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "98247ac31eb01028ffce54987e8b0032b8cdceaa5b22a42f29ceb979c99e9aa8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "ca8b6ac2d1dc08d31b9beee316e0727fc591c98235f971a13c502abc22a0a711"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "52bc5158c7e1be4dd72ff0c9b2fe4a18971fa66954ff40a173dbfa387ecf0840"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 181424, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "76a693d0200b4b88d8260577251ec15ef2946114035f66908150ae9e478c8e81"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 1, true, 
true, false, false, false, false, "1a06d5a08118f599b7e2361dd53584583aef2ce6c696508095ddc06ff4a3e1e3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 214208, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "bd1f2aaed935044189f8e3995d52e3828eb8c37258a6c30beba2712d51b5c7e3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "fd68ef0a038c77acfd455ff8b6d76ce3b33b6f9c12efe6dace5e124b70fdbbc0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 176656, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "9c23e5ec8bce766fb7c1e2a50a4681c70da3a100df26f3ec2d394422771634d0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 181440, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "45bc6be6afac4aac9b3d1a3de8fafa246d7f82630452bde3eb2cdd166d4961f9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "dddaa19a8b3a2fe701537ea24b42ec8d4216fd2d738f87d8194760cdd61ec6b0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "8f66895ae11a8fa21ec907624561ae37ca4ba75da90c7fc0a47f571459362669"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 214352, 384, 2, 
32, 1, 3, 1, 0, true, true, false, false, false, false, "5d78363e0ad8bdfc531e290fa266a0be1595e844e92c6314e4ecf0ea83237b84"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "184da0130e01143ca65d1c2347a4f6c6ce3e78a92399e5b1ae3fdcf8a9a8b00e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "6c56cb09bcef8b2cb1a91cdd3f0bb01d4cedc4a82da4c5b00dbfb1df3cc67641"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166240, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "09bd3d9bd4f5d9fbaa029f058bd42c1566db48326b708d888ee2ac4b5854f240"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "5707f00be2c2f2bd0a88e064b3ad272738f16f7303549978410bd854ea550b82"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 183648, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "43f84f28920dfb9ef744fcd86c94176a927215442c650b3e8b724da719c18b54"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "ccf581990b79985d6d50413f5d3afe7405e220b8acf9a717c720b52d3664f350"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 181584, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "ac69229fb9025ec3f5a57063c1f95ec6e964d39215364067ebf8cf441267d412"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 181408, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "fc2a2db2ee757629f55d63b7337a44f4201a1ca6adb8afc9dcfaf2a69b5eac05"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157536, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "51a9c6718cd7944de69d6d45de96e32b337187093f0757a17b48edf032c66e8d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "cfebcb497a653e85151d1d6e3d87286f0bcea6d371fa1bffd1643d3390d8ede1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "c2e760fe9c741a0b6b9cd5b5111f542a2b3701f69305ae57d1770e38f88f97d8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 
256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 214368, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "e6395ff182c9dab1e77300bb73b830f6341092864b1450ee555f0ceb55928187"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "ce306590220448a6ad608dc7e0c6a803c3002802bab85785bef053aeeb0cf778"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "28f09184e886f795f61b7c7cec43f3b77c4d77eefa02deff563631e282868ed5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 167360, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "ce684200b177aaf82c55c445075fddf39cc9308d2eb4c021bf44b1c86dab56c3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "e5995f9f98dfa67ea2371ead669f9ea74a388b8128809a64dbd7655463d77e9f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 185024, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "aadd8c5164bb0fa9362b55e215ba219a46dbad1a31e85cefde5db92468e2feaa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 176656, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "4a7454b1cd7dd3516184ce1eb402b3d5c3f1309c2b8606d093e87386a9ff1ef1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 181600, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "1f1e9423e97d7b831eaa6bfa6303a0c022cedda57032b4bf3e46cb6a75309a80"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 181424, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "41e060fda85aa0c18f998a4ee3b1e4c8b7d9f912efc67ef35571a2e16561f253"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 158528, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "75a4d1e195a0c8c155637bcc2d7c3f8e4d3468c21f3bf64776a23c2590960ce9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 
156304, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "f04cf0c134bf80418eeb9985d237c4c90375e53a05faee42da51aeffbfccd2c8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "ff5545e17393ef8eda23c98d0dfe8b2733033ece082c17604a68a59c6100dc05"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "bf9570f3317a0f08d25c7bffaf8f4a804d2b88b1b3b2b0b0e4f726059c5b87e6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "d60664b491182166d889cede0081526eb9c302921c4945425cfa07c01dc22dba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "622cd06b7c433ca6004be6667fb50857f910e112cc7834efa6be4eccceafc135"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 214208, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "ce12e84768cd2698b99bbd856243ed03d7e5fc090ad310ae006e9a95f47350a2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "f3109a74328ef0d54cb102a912ce1b303bd091733431e81a0b7f53b7ceec3b3f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 208568, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "559a4bf36e6a59fc55502a891354d3aaee86c3fd0ca7d6624ee05d8e33a066df"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 181440, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "f53d31bda7ab9e60dc002d6a79712d3559ce16cfb7f7da3430deb839d1a2a2bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "86fb7fb84af7351b885b73c03b00ff4e6deecefd9d1461cf51bca6e0b19dfa0d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 214224, 384, 2, 32, 2, 
3, 0, 3, true, true, false, false, false, true, "256d0a00e56153b39aef06793f73249979e3c9e15cd6369fd23cf1400c84bc53"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 196376, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "dd21cb6e428e2cf1d138252e844c3cab68e95a4069fd7e2ee24da341b9335b44"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 209944, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "649bfc9b3b976931bc020112a541916eb12a40627d1297afae2f9a165957f94a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 181456, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, 
"9dfdc982ab91995c8b9b7af29920982aaa746d577d70c922db3fc914f39d2db2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 189592, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "1b6eec2daa19d55759e2b55f4ca647f9d5538beed1c3086371b18530f45b562d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "b938ec7cbd46547e1c90f2de6e53f8e4e5c24d33f092c216a71306479ff1a90c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "8af7239e36ae838405d72de514a2de49dd6b3cb43deabd37c22ef83ea8da6873"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "9e136bf5882bfe1ca69ca66af384ed964a9ead357600a4e0f9b84bb5d94d8303"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 181424, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "29fea114faf5a199442edbb6dff2f175ecfd61627fc4500fe4320ff1cc6e2c46"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "5f57c3b1437d2b3d373c8df7b2a141d3ae3a6a66e0f3263ac609490a9dda7524"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 214208, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "a717d5be509558f1e97cf18091ed7b7af0c873613b2166cb936e45174610e563"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "465fe0aa2cdc6cb73c3772c50b672a11d8ea2b0c50eef270299188cf2432ee9d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 176656, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "d6c65ece69aeba2bf4bd3fa1712ae98d493f602415f42988dc525adb0a77e4bf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 181440, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "dc85c25e12db39381de9ca6358ce1e90859982efde4b317d599da1d9e7c78b37"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "6e7a7b013df5bd0fffe38f2bd2ffd964ba573f3dbb12a681bab9e9f0f9fbfab2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "b90afadab62c1100bbf70abc6d6cd6f1e5074a09b7b01ad039065151751af914"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 214352, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "1cebb0045da74f845a89c7888b7f4cf6ff28d0f1634b42c9248293545c8e0981"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "da1d1022ea762cd75a661529eca25e1a10701ddaf339ca5a991309351c85e743"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "d8b2d7b5bc2a2f82a42b113fa28d2676e3aff8a29b3e4383a62505bb450a7758"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166240, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "d8128b739a284b89cd21419ed427feaa26ea55a57223e6522c213ea56227128c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "f9eb67878d0e3494cea18a7fb76b311e61526968a4430881c6462a281b2b704c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 183648, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "31ceae70513dd3cff8e769c79154e6e172193d20bdcfd502a9aba6a23d687562"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "4b9be6f9a02a19d30769610162c647d01e4d120769a05c51d700b3ba49e8674d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 181584, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "c105ab76bfe2c4d9f538193e6f5224b9c3843fb64a16fe9bdab4672757599011"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 181408, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "ca77000407a369a57388491fc1e0e9be708f3eb9b8667e3d6b1f777622fb0278"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157536, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "1453a937902fc51e609ef2f5d229c1d57a9de1527e4869242af4d7f247ff74e2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "4a62a3fba58277a9caf337ebb4f7aa4087b793d7e9531d2afce3d4e9be112776"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 
128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "2a787f332f1f14913af3915ed02e31518f07a18b21ec13e0771bcd0c5394357b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 214368, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "1e10ef4bc5fe9fa51ae4baa8fa7f3f9785b5906b2b1bf960d9e16c86aec92ed6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "0ac2ffddaa012232e1add503cdc9854e9e5b516796c55606a6191e6145005b8c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "89fe90756e0d89d33f2ff3b2801831a587a2f4fb0689325b96c785ff5a0e5c98"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 167360, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "4cd2c0c1302545d882cbb7099e43cc634394b24a511d95441866eefa7a7becf2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "a0241d6c0902a524a8d23561c2321574d8803f26dd1983eb90579ac612c289cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 185024, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "dcfdf80b738bbfadfd4b8590348780f80e47f5b9a9650bbf8d5012b6a936ea34"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 176656, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "d91382fe0fca8175c149fda6533b47a8aac873d946239876163af966ca0327b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 181600, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "eabf0e0f1cc79bd678f92c07e86fa3abf7414001cf0e6657d315b7fcf8248b8f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 181424, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, 
true, "ae74406006571a2da081eaa0646e6099d73345f615b762d7e2647ed7b175532c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 158528, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "30a178b13a8466ef60d859b31eb0c40c39f17287edd1710ca3bba78a4f596ef7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "0a06f8e3f151fe946128f82d268e68b615cb4a13f11a24c43c0e07ed5b0c69d8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "7f47eb739c555979476bc5e44634d7b95dbe95771b6557062ecd25be86ed1faf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "ca7e1f281d9cc6717e6ea88ccd2659f1eacd23a471d4411cbf44920498c94a19"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "a5fc29e4c39d6d1dd795386cbe68609d53cb4b756fbea0aa3a0e5add494f8888"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "53534880af3ac501cedc62638f915e40b296963f5233f6eb4a6f7eba829b8a47"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "2ca5a3e5d206291e54c8d70d22c92a18be3b303e3bdadf08c4032154541ca289"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "ff63dbb22143b601cb25b976058888b79e6f8b01d14a61325d78607dfd27d433"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "6dfd84c9e31d56bc03c000403d6da3cc2b3985f97743f191eacc46a067fc8ff6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "2e8635381d77147dd2beba197b8982660031f62eb40ee4c9ef6d1c20ad78013d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, 
"04f5796ca9b9fb6420d3af5ee95d827244edca86617dc0f46b07cd35079b0166"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "13989ffda87de6b92612880edbde799a15acda8431e5f98f4f5ace0a9264f721"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "73d2a5cb6f714e330e72aa8da565beefaff7bf5b5ca9ac89ebdf4e7a9a994d5c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "36f635564deaaef34cccfc044d021ce0e785dabb9221b3008e8b18e4d4ecbacf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "17793309ee3990a3a54e6688323605880dcc95d8769655b3839ee0b1d96efe57"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "6c4091b955a48c5c339180019fac1238062391a175705179cbefea6433357563"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 197960, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "300a53a5b00e9d7006c2998749c35048de45a8044b81ced6580196d90ea72ff4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 152912, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, 
"e408f4ec34956aba34431c9f0f4f048370c6b4f5378344ee1edd96f64aa4d946"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "a4364d6541fb89cad0815e3bf0422b8902f73e213b46956f6f3039e6ae451a95"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157024, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "f46ae1d65ee4b1509ad9d06b4983502b19e4ff1f23b907667a7ae9f4c98ec82f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "ab406acee7130f793471c83d6ae952b4f087b1d9498167d3a6c840e7662258e4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 199336, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "0430d0ec987433f049045480a0d3d0fcfbc42ce7296849606c415f51fdc7a19f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 152928, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "b26a14084ce4e71e5046e22b4e899e6dacda7923cde6bfb87df3ab05441d980e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "6cf0ea4c17656eb6ee7ea96d6b7c6efb37bff5440508503d871dbc5c38dd6e42"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "50b57d812839a5883724d4cead8e690882680fa7e3428975f0ece787c8def402"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "8e2e909e30bcfaaaa4bb98f431273ee109ad87f49ef3706ac737b4b71dd4f512"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "fe77ea7da4087b0f43c98a6a784de9fa4790b49a3748cfb81ca7ac2c18a54c5a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 152896, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "714813dd89fa6091f0981d008ca850c8154b3c64fb2fadd88c4b6282219906e4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "665ce9378d397afd8ee8f75e08b88890d4e71738a3dc72900faea7aa8ac2cb29"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "054283a1dd8acb45049b7b764803f60df1c3d8f3126fd1bc05b352fe5f0f05af"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "553f7bfae94563d4118967370ee8010df73f76b5f3334e07c1e3399217b67551"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 164512, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "eb3d6715b80e3b7dabbf29b5a502b144ae471450744661934c7bfde147e9df67"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 152912, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "4f91444c6e163a91e1bdae4eebbf937fb7343eb05fdb54ae3dc66fbd6913c95b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "727f73092c0aa644c8d0c271ef72f284459c41eb4f9bd14aabdcf909cb056035"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "d16954061955542ea814fbd199bf4771730b5fcee4563eda0ba33cb6c15ea00e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "7f2e384a0dbfb79313e06fd4fc0e749b418ad6aca3b64d14bf6081c8e2c466c2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "4fcdddff04111c53c9c4cd8867d73148c2d77ea8b00eefb3578b1e3d0b6619f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "92447a85483f6cbb21d2e04d35a8eb38eb47ce4f0d2dde8eac549b6925449187"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158192, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "60fc296aad5c32efc959da4cf102add8cec258a77051e47072ec25edf5358f35"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "d15e104aa7ce31801e15a585279d93b41ad0f0f99dea4de30259ec17abd8ec3c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 167408, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "2e115132731e9ce6499bd1d2d7fb15ec888597102615db01e92b442623e7743f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "93ac61bb3268e3a9713b79e389e5f718f60e45d13042f78b74b83b7e9ceb703c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 157168, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, 
"9a4e4a5555728b70b83c2c398163afbb07f31c7f94c8adc4894863c2164158cf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 152880, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "51e8c66b2fd2cf188af50b5162fe3e794bfdf86738457a6338ba65b324b244c3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153584, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "ed527e7df6dea7dc02e8de77b3cac4d77695adb1923343be95ea5e055270cc2d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "9767033ace3f0320fd7f1383e2f568d6785e9db7ef0d7f75fc1c7e24a03d9722"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "726c160feca697990121c4ca29c80650e27dd61198c9e479e950059da0e5e425"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 165376, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "e3c0891b2d93a8535284c858777140f1ba6209ff56670c3d8113859294fcc71b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "52b16d1929db394dde7bb38f27a1246854d863207255b42c43cc8e576feac73c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "fe3a96b5f25d4998178a0b075e4818eac7e4024ea5a74b3985cd390efdfd18fe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159312, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "3ad18127bb06b5f0d3b659f036334079685aab0793c7805f9278328ed5758a32"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "f417d0035fe7551031a2d196dbeb37afc2313950ae9ac2d4b13658a9243134d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 168784, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "97d2744eeabf57f3f319e85d4730c8919dc728bbb78ed6c8fcc59f92d7d9d84e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 164512, 384, 2, 32, 1, 
2, 0, 0, true, true, false, false, false, true, "3186da2cd51f183e168bbfd28d81c434dccbc5497612a4bbcd9a861f323d331f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 157184, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "24ef8b80545368ce1dd0b25c5c7c02a10fa68f4c639cb3210c06c606e55b0869"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 152896, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "bb453294109dbfba3af1e02353def1490d304e787de9f07f940d11616d2d4652"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154576, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "7b43b9f484a63c6d8484ea297c79b1b615ecab5cef10e4de72e460052da3c4ce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "af9c06789dbfcbf089150c84ec918a99dbb8de0570b4fa31f6df0c17b1def734"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "8dc89e441de6264435f54411470e3a1f0c8e5cc39bd8b5dce7a00e1ca0d87292"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157024, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "9891c234e40a9dd85496eadf3c56229868b8aa87d5100736bfdd65da85bb0d4d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, 
"990341cdbff39cbcb23027c1ea8737da0bbeab3cc73d1c3a4c12d1914fd6da73"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "4420b5f3b2780a1b151ef3b192efaeb1111c3857f752e96586adad19868de071"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "29687a1c52228746b88eaca854035fcf37f51caf559b88a23c37625eef605b03"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "3fc0594935135b503feb36da0d4ca314eaa5efb3f064ac8c2a07d5c6db57b0d3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 165376, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "0323e2af7463f20e32ac579b45b408c65671a2ee5eed429637c5e83125cdeadc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "186202f228f437325e214d41cc87349626e3a54bda934e80533730990161aeb7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "a0bde45ba96158cc16a2d2d858494b9c0f5b67ad86a8fd6f8dbfebeb2faf1798"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "e844141668868a0e409502c464216bb803bc1b280dd3578cb2ac6cc2a81f39af"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "296e5502952a650b902df250975fb2e3a15029c8d96186d43e09cb01b69ad0b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "a79e68a471ffb767c2154d27693c3d8496e6bfeb62d747a6a97bb02a13009486"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "e0b0dd040c615a1a6a132f07185a1210bc17a5265eca2bd09f584868c379dff1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "ea88884bed914c7b9d3edaf89b64c401fb5587b939193105a89cbaff4c78ab61"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 197960, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "6dc1f8ba5079c720b7449d221e657dcc001c245f3551a6dd8ea2de1e06da9e01"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 152912, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "69155c1e7dc3f84d90073ba0922ab9a15d13298ca091a16b34a8337ad08a6f6a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, 
"12a007fc5e3304d8d539cf76da96e4928e2e719d77059775a45278100068b318"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157024, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "f28e3f820055e30ee5e2e20d287df836a84c660da416cee689007ced1232c36b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "2ea52da5f8367120bb707bb81246bc34ee596c9ac277cd425438da45686d0af3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 199336, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "884a505ed7e8ba76c4199f06273f505010e88ebf27837674b9e7dc10ab9ae818"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 152928, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "8148e53583a62108b3503c0da04df864f1dfd90a026d4cd796e143a20fa0e83e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "912e373e8b8ae08afc607ca1e4b9f0f55e3b66ce321066d02f3e64e30095754e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "6f09f4a50f96f71cdebe540a9fe6c231fc81f1b25814354d657c1de184f65ec7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "aea250ab2f16c682344aba6c42078564e1d3e9e65bca5a3eb2183d35f1d027f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "7487b85d8454706d96dddb7e8e06c580b9cb4d25f3d3c6f60049225631bbd591"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 152896, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "43775bba2f03b3b90c21090819b3a0c2dee5a518f49bd74fe6ceb08b72f6b040"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "8e331cc76229547037ee1d3af5d32990cb36ab7905f031bd6f502a4802265ab3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "03e00ecdc10e9694db5ed9d43e0e151f941278210617377b86247efc4efafb5a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "dac0fd075c701fb79080985f5bc6cd2f02a0bc9117959f00e183318e8cd98b3a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 164512, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "ff34492abb82a93a4aa038bb67ae5db5bc99c1c686563c57ae30b8ce7fa108bf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 152912, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "4fe9b54e780df01736584f701a237bb03051388127674dce9496822ca3dac9ff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "ccef97b298e9dbe090a328dea790e62e406d9e0b87bc24d548a0bd714c1ed556"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, 
false, "94d4e5aa3f49d71dbff2becee449549212cd16cb914d55fde5572f73ab8afad7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "3a3ddd6831da3700c8d183e2152f468966cc42744da6938efb94b06215aef614"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "15323cdeea35ec5959dadfd3be35f0a1e143882215b572a7131714ab073e30e7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "e35de7e8cc7508577ab4cd03c2a315ecdadcf56b6058cf2e044efc5796515eb1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158192, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "bee97efc58e00b1ba06633221941fb62538f05ee13f119e038bcfcca17b9ab49"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "5f42030a4e945be78371795f0a303b126413a3bef61ebf8999cbc722dbfe0e17"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 167408, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "a8d10c88e9a7fb8dfebdeae3d1c00deaba4e387d554e4cd962df9725bec5f20e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, 
"935820fe5bdeeb2d294b60ce589705f9057baad4675e191cdf8069d6453266fc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 157168, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "9d296d960efb94df3f4374a10195e2d8cc5e2af032471ae373939622df3f4349"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 152880, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "36cf1ae078d9870399d02292244c55ccec38ad5d988a86b0903b470ce19b56b4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153584, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "427ff15532ec3debaadf5f64d7c0e4fe534ba5551e80aea35ed6f178b6cc24cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "b47adaa1e63bfcd5a6e8f1acfb355a496b27883bc2bfe790459dca883df5e958"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "3f1b4a4be574b0a22ad58c6e1049cde1a08e3e0b1157ecc52a60daa6e7350f1c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 165376, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "6c1746445792c056714cb84082771be616e98258d0f53d00979e4621ae2c517b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "c760715a11756c3fd7c9cfab1f41d90539c388ea1985eabc6da34a96b9b4fac5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "7cdeb6bb27e8e426dc6ddfd8c8528b47e8938e49d49fdb1181f2df53f2b7e387"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159312, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "5264e18d4c5f43d7919a8dc528bf95f158748dcb3e929216330372d8aa36d9c3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, 
"8f333dcb63de3522dc683b96edf5d9d15b4b25596ad0b7c50070d6dbe80c713f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 168784, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "59d763d544c57860d8b5238d5eb672f7d8b4af40647751095a8a8db560c7c11e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 164512, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "49f9afc47a609d929e9a10d0b349029ab874b896f2d2554b40869c914a3a5a35"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 157184, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "f4407d666151ae69f7ffa6ec0c7e86249d8624f261dbd4037d1d69784bc6a793"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 152896, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "6bb128f568a38f8dfb12e02c0056d0722a766c85ae7602f0df71763d8bd63f9c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154576, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "94d2e5227accdb5298c2247d683d8984712971395dd36a80f435eabcd5bc884c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "4ebebd1c50ee2f92ba99272a36475c327befe324c9634e3f1bbd5e92cbb69520"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 115104, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "22ed1424cb9bca655bf37c616501fe5ebb01874350eb0c95a08d2a146b14a366"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "b7dba6e0d3bee293ba37ef11581511ca8168421608b032d10235b34f88784d89"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115120, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "b76d5046bc0d5c3b2f3b8799d1407b2bd6e6337f6ab6e10ce2c65d79fb25e46c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 114944, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "bd990d5f65eb4235e5a9d65e5d579c4c4abeb0f3a2752612d009a090a73b6d78"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 115104, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "e4702d5523f220cdb0b7648acdf261e5c9de6b6f2dafe3894b65b2596af70a12"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "659b07a378c5c1268c2acb38a454b66d22b742672f21d564e960dc9c77637529"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115120, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "108bedc070a6bded96b1c946ebfaf2dc668ea986b24b8d139d21a04eac4bc5b3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 114944, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, 
false, true, "b3659d0cf9795741c2af6dc6be64a20435c061ab69f605af6f3c1a42c819b13f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 115968, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "9a7def675a29bb1f64149890344a1242a5ab3a2b0a7875bd062d90840b581fbe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "b51176b76b2725534344d033d3f97ce595e7c8b8d610244b3f08d3a83b3ab914"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115984, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "9255a8a3abb3913d2f7de5ceefcdf3b4b1a9d2bd57b43cde531c1e9938a2fcf3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 115808, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "8b155745cde943df64ddc954f2c494e35ace275230d83e2a64b38b965a5bceb8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 115968, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "8af7aa8eca69cefddddc4bbc8caaae2c25a7f062c774f2bfbc0e40f295bdc183"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "2c628d625146434a63230df19e1e8026f0a4371c091c932f92e3cf8c42a63e90"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115984, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "200b1620ebb887b051c9e443bb6e0cf8aa4ef6b94b3e99668d8dcd5eefb94ba3"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 115808, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "fcfe4d14e394868a741c3f0859a44c986db8cff936995ce367bea75168401eb0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 115104, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, false, "e89c0340568bb897587f6f44c87144f7fafc51b79a4e719fa059d977dc78f942"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, false, "6569d9734240587828c3d0ee8dbedb7bb2fef234bc710a6a72a1d2fdcadf55c9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115120, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, true, "21c5fde3292d1a67c554c6ff3d64ee7a0e8512217491cc6381c499dc16f6f2d7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 114944, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, true, "209ab6049f8f18f23b24490b0a7f9d2fb85b7bef0a828473ba26e30236ebe4e3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 115104, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, false, "e83f50154fe9bb05904806cf6ae96ff9e680e12e86c64cd835fc112fee85441c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 0, 0, 0, 0, false, false, false, false, false, false, "41fda2e1ad0073e1e02ebe1f59c70336e050638d181cf33cd4177e3258fb658b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115120, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, true, "821efea34a33c8dc4d6c8c86c2783f6b6d3bd1da3ed49385883e91129dec5080"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 114944, 512, 0, 0, 0, 0, 0, 0, false, false, false, false, false, true, "afa5d58525572e8096ef74164a9c3a5a733df744ef76974d0ddab16612e64e52"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 208056, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "208677859113fab9cc331f176fd4af8128e7649a048d0618aa41ea4340379245"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 208200, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "018775f0db34cc66e8ac55385b43c18cb780b7d62776c5d68b07ea1068723145"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "6f2bd32a6e41eee9e22173bf7359b99f7940862ca40a9481d938e6630987e9e5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 195400, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "fcecc67bfaadef29cfaf42aa52faf6c604c22ee3cf25d41ed2f20838e8f80072"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 209176, 384, 2, 32, 1, 
2, 0, 3, true, false, false, false, false, true, "fa60a7b8608f60219cfc7c23f57cbff45566cde8a8ee58f2f243aaac4da82b99"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 209320, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "ffd438775da7764d966ffb26df6ab9a2aa0e5f860c14f59d4635b82513e30af9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 196248, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "71fd52959fce7ea24c82ce31a4b751fcf26e8026b93667f97c3bddaf65e5d424"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 196392, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, 
"8e3181e7672e950fb746969264d78786ce9d1d96af183cad1f542f247630cb05"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, false, "a99a6c790c7f9405ac129f9b62a5f6c238026a3e05917136d2c7ef8385ea8a27"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, true, "5517510ab52e6ad0e2aa6e72125d494f603dc098ebc564cfcce43f1c8a199e63"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "f66cabc240c811df21b45dafd402d16042a8304ef82f27487ba3dd4081d705cd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "eedc58ede6d6a562f83939606f9b7f1c6f1cdbce736e8776714fb5db8548838c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "2dd9d8ee5d0ed8a92035bd65af1be79556af372ed0f807aa223774933ddec42d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "9fa034a12529dbbfe9bd5923a62b469e1ad471e59ee6cbb658ea8edc7a02cd2a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 175376, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "9c10ea4426fab9ab15f12fc868232e9a1257821dfb6bd9268a8b64fa9ce6a999"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 175520, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "c2038a4a61932ad3bc0836811f775c0b400dcf35e29c68885f489f8f4cb09c6d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 162448, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "3e7ede36b5b345f77a7500b22485e60680c8cc99e5185c607e07b53790530fd1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 162592, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "a931efb9ebd3423da6f5dbd3659cfc2d319e5aa608e4cdb27b5fe4e8a17a9e9f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178528, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "4e4a82df2e0b5064b3d9ed12664e9efe7740d4dff7b52375f750c32a48db982a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "85dfb84421bc1393d7630926fda056efae5d814888d58d1e4699d06e976cb7b9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176624, 
512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "e70d3b1a448b316b4689772a8ca1e740546ed7e41d7faa35c6c0aa3915cbaa2d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "8c601ed58736c6da55964c5ee87a4694383c4f3791df8c0c35fd0db7fec03fbc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214336, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, false, "722bf532c1f30d63ee17344bac4d6fe8def7a8e30f13f8ef8c75c79822816b79"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, false, "9cb0884d5be1fd6dd7f80d597eb029be7025fd563885a7abe6f2ae8fed4a6692"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163680, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "7c1959322f0b4ec2f5d3d0d083ff4fdbeaf61ce137db09d1c87d637fabd0892b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "4160c34dad622b2de6d447ded37cda59fdbe744324aafae633ebe7762cdd57f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162800, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "6f872807ccbf940cdaaa1409e82fb04b6d8a4770495d6d2225e3502dde052688"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "8955fc9fbf1396b77b28cf19fa72e5771c15973b372fcc72ad381d23d3c412f9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 179648, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "8efcd6796920e43680ab1137c6600b6880f921e073b8425dbb35a4b5c210991a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 175376, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "031c55fa2418f0c473eb7e86db0a1caf91012ec2693e8f3423804d77bc6dad25"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen", 177744, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, 
false, true, "63cbb3a66660cf3614c95c71ed0d847bade754dceac30bbb72cb1f863afdaf49"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 175520, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "26eff0815c1f0e8fd305ed986e55a53500546741fbfc3661ca793ed93153d7a7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 214352, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, true, "6b5eb1f06dd6a86da0c323daeb69fad7f5b54973358ebbf5f329cc42dbc94ac8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, true, "5b290e4e53bf5d995c55b810a52c00e2ec01116e99181c9f4b65c38369d81047"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 164672, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "40ecbd7a4a75b2b24c0cb94f098f57741e3fc2d875ca710bb66e5ad82cd4ba5b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 162448, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "4f097978bb2a01812de9ad8912a35e1fa84bd2f5345f9d0c96d03e7e4000f43d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen", 163792, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "c3c3f27c796c7b705cf747b9955d66eac8b48a9e08cafb72f28ba7005d8a513f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 162592, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "ba99f4e8c6ab0445b334aca65d570ffa77ba8205f913eb150775fe2cd79c847d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213160, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "2ead6675ea67d09f336a79b4a01ed17fa6f37e972694e129b0e65f0dcba67a56"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 200488, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "b90b01d282eceae6897230a53e44f152119406e9159f21b9474c5532bcfd2323"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "97c4c75f1a72670b146e275ca31e6dd5f233c0ae782c9f6f29baaaa450f935f0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "f255c076ea9e71e883c4ae64e6246da1ac9d73d6a269e29d01bc73c3e5b4b0b9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183632, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "aaf6d448034586ee5b9c030966ca1ef6a51e180821ab50df82a8a17053a2e68f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "e864b7f283c442662c20bca9df1629d4fd20be3b08f270eb973b5fa0f598993b"}, 
+{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168912, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "76071e7b55556e35a397bd5aa98accdb0d456793444fde0c9d7e465a27933b3e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "f268013c9ecb6d0ef515431259e514f5bdb32704d6e5506592200a6e83ac93a4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207544, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "eb20811c27eb855e937f4daac2892ec65a05e568a13893c7eddc6c30f40d0daf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207688, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "5d7c3649e4f1755e21002342446894de1a639ebea2d1ecf57c2232142e434024"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194744, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "740ca7992c95c23fb3eb64f90c96330de9d359e0f651fc212c2a2a0e6be81f11"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194888, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "9f473ec4adbee0968caf363712e3f988efb0d1a13698295723f64371cbe4af2f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 208664, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "8a610e847436347473ae85e56766a8d6963a6f43edfbd0bedbee688772a0e813"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 208808, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "c74b161dba4da4659589e94e7d0591b06973b16cdec8b00755fc444065c46981"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 195736, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "33205dcbd436ff0172801d2199d3bed6a0b231b1358bbffb94145c89732f2ad4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 195880, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "cdfc1498d07faf85c74dd0a9fe242cb73eac640325365d5a562fefef6b83ec83"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207448, 384, 2, 32, 1, 3, 0, 2, true, false, false, true, false, false, "b018380d835eab5bde3493d593076cb31c6519639d5de98779257474ece005d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, false, "e9d77378003e64293d4bf319810bf135a5474fce49cab41816e7babc6e634196"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen", 207464, 384, 2, 32, 1, 3, 0, 2, true, false, false, true, false, true, "7f6a03ad9af337a83e24e069322bf1c1a87141441acb3d81b7aacd3ec6521252"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, true, "23c33b17cd43b789ae2deb512394270acd1407119ad2de11937b2574afc4e236"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "d8affb7c60091573f1a2165c5ba9389b5d1606655c18c6fc9629e57977e3cc22"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 
174400, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "19404314f721a5b7fccbd6ca171151bd6077ffe49742e33c48b354dab4b10d73"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "2e4cae875d70fe6139fd451a62b40fbd58a38623c000906a8e8258a867e1d3ee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "506e777022e414644542a14457cba086ae8cebbb8c1f97eb8e60b6a5ce99f5e5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 175376, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "810f362fb3cc656590eb1839776c61e421d69e49721e50ce11956a2bcac4be89"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 175520, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "9f5d34f083a6d1c2570843c981436d92789ae72c13813f19160369eb2ff03fbf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 162448, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "aaa52a36a2eafed1c7361e866d274747908da3b84d52871864089ed03e809748"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 162592, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "f61396cbc750b64e462f7e69aaeee7473b611c92324f50f85cb8dbf2460a741c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178528, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "17526f71a9cf16cf8a8756bcbeb3fa0337cfa1408c658a5a73370ce143c20912"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "35ab1e7b8019bf4d1f7578664636fa0711ad09c1ac3219289ed3b050c4e41d1c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176624, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "d25424391a7223369ae8b97a12fdb8d6b6cec76293831eff4bb4c1ca985c0fac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "da2d6dd0cf8d5cfa2f05c578e305cdd89630c89bff695f799ed72f6b606c7046"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 207608, 384, 2, 32, 1, 3, 1, 0, true, false, false, true, false, false, "1321272159b6878416d237c111f6d2ca8fd954b26c6dc9523d10be40ae4824ea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214336, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, false, "a2353c637f2f0a30f6cec0bccddf88be71b7c267b708251a028e2c4e03a78964"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen", 207432, 384, 2, 32, 1, 3, 0, 0, true, false, false, true, false, false, 
"b3d46f9736aa05d87fb04a2358102049efd952445434b4d3b74a99994e434382"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, false, "5a4398f62b09e1760dc2e1087f09db64186fec3ead5a0f4a6cb50305cd2f08fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163680, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "8415aee87bc27e5a5c04ba908fc248a5ddcc1cae02bfaaaa784e23370d2769cc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "cf727e41c4e5ae1377cd83a8e098f126381fa90d97cd5fcb01de71aec059a541"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162800, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "c263590dc883347049bafd65ee02310a8ebf2d551730b9cdea29235c13537da4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "c1ad754a4463128e182800e1e6b92144057c7b80d0fcc3c94047f1a5ef369e44"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 179648, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "1ae53164a44dbb4ea66d4102d70cf7c3766003a96e7afdf81ee28c2573782d95"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 175376, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "393edecf6a6237476d5300a9f282a1ec24995f5c2ced05413a2dd19179655117"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen", 177744, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "2b4d0c475515e23b2dba927e2ddb6c7a55ed62014bd324b63c985f05b3800f29"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 175520, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "3d333e4f1334f4f035016c422513e0929550fc12cf747e5d3379a8b1e93c5f41"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen", 207624, 384, 2, 32, 1, 3, 1, 0, true, 
false, false, true, false, true, "79d67f13a6645cd33f35afd95e4227b72e1b81cf70fecd3d7a6b80c24aac4933"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 214352, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, true, "f2f0fa081319f34a66d0f50b652f7c85b9eba2e5798392e092618588504bdb4d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen", 207448, 384, 2, 32, 1, 3, 0, 0, true, false, false, true, false, true, "8509d0d1cf9b0e41746a6fb5347ff9dea869f5d3a625a85fd176e8c5c04abb6c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, true, "54b727c82f7e6ce444537c4c1dcb8b4776f3c5b647e6e99a2d9c32553c79ea18"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 
128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 164672, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "e1fc5cfdd26bff06029df207902234ac7e912d157f686f9cb995e19bb1b08fe1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 162448, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "3b573c5c43d2f317879f4ac37b279d7945a10412990a3d726bb7d20368659403"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen", 163792, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "5406a4e5f9257cd495db6d6708622799c2208036e8a25427fa96a5a3c8013499"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 162592, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "cdc0a0ec2edfed71d455b626f1c817ef7a48adae14234a5577d23b7ca1337f1a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212648, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "9bf0ea5940f4c3c3f7d890ec71257793463f5cd13b5702590ad8903dd3311184"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 199976, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "e4eef12ab5d036b21ea4ba0e1180d1d12188a230bd4d7634072b77ac3738e596"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 212824, 512, 2, 1, 1, 3, 0, 2, true, false, false, true, true, false, "a36245d23fb0224c1c6b8a6505e6c812eaa0733b2f96d4a0738f1c32f2b8e0a1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "481cdbd3f2cbb833a6ce443eaab6590c33736a25965db5301ae051ec45202a5c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "b4b5c3c80df851fe889dddd0d2db562a110cb454d6ff7671d8197223254a10da"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183632, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, 
"fd32b9013064ffa6d469e1e770e4b7cf6cb8dd519f21fcbbf8186c88358ab735"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "3246c01830e50c5213430d22da6b796d7bec3300498d8c90de92135f1b085ebc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 212984, 512, 2, 1, 1, 3, 1, 0, true, false, false, true, true, false, "d292cad73dc7d63b07c9584868356501b609597e7bc56a34c5c8323c633c58d6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen", 212808, 512, 2, 1, 1, 3, 0, 0, true, false, false, true, true, false, "5ff7c8ed81bd20e309a03abec490a20fad3d46f8302320fb907dbcd73758df7d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168912, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "0501cd815140748ee0574c7811b0cc72ea7774853d645794ebae0011487b80fc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "81b463443f31daba9a692b67948c4c28f95db800ec17033997c79ed751d6546d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207288, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "85b6db7d3113a987391e21451e88d6153bc5f85c19f6415c41d2cdf451f43e95"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207432, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "85eca4617a10ba7fc21e59d8f8aa668da553813deadabde33ae7fa747be05e8e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194488, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "09817fb1b35635d6bcdc521900f89483f7943c9ffc4f39a49152d8a42ed3fb8c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194632, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "4a78e8a5e4ab56b154ebae442b009f5dc6595cd201cef330483ecc4dd4ca8a79"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 208408, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, 
"3fe190a2d67f227f80fe28331717ae5625f744b8fb41ad320ddb92a386f284de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 208552, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "8398aa5753656cb763b7fe53bbc68d62e8dd6b3e42997163bca64c4b13689aeb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 195480, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "038a7f362a24636d144a68ea817ab26636feaa111cad8d755f3a87429c47ad9a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 195624, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "c30da4cb2d4d73334ec2a8255628bfffbcc3c36dd75b62d420b263695eda75ca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, false, "2f0debc8ce3908e65f6444d54edff4d3262f0a366ae9b17e2d14d13b77b03893"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, true, "0b6dcf16cea0a87640df0bea52bd57293f34403461bd4591f44270e2243272b4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "8d7128f069acf679a12df44d5b97ddca903120524dbd2d89ed09aa2f2327d013"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "827df61c2fea728ced25e23bc4a130722ba3360356d44e2117b2366e85e81ea5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "7bf3d3ee82a4965cd6876accfc24b30126b2a6c83299db5d14c4c776d2a1810d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "e750d016aff8621f71f916f88da01e947522af97c8f22597a284a69bc37a12ea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 175376, 384, 2, 32, 1, 2, 0, 1, true, false, 
false, false, false, true, "353c097bcbcfd4e4ff8900513ad6d5fa63e126c6bae3903768f8fcf44e0a2118"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 175520, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "54cd7134d4777c13e8fb18cdbbc738cf56c267428cad1c611a362b445ebf6b2d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 162448, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "f6e964da261e7425446f1c3d2fdbffbdf014ef0fccab1260b62d8811d916c841"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 162592, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "31c8467c0b711da443afb379d4cf906de8fca65098c64b2800039d216155e596"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178528, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "a416eb6be30535d563846eef5293f36b181364e4640f16f04bb9e12bbfb9095f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "268267e055a10cd0711dfa71b6556882328f972089c235ff64403c45b36072d7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176624, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "bd62e77f9e3ef3d07fd4c575a509db12e10f3527ae8798577808cfda82cb0376"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, 
"c027b716f901a9b0bd69683f2dc0cb5b09dbd9decf0e10a752666622d974edaf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214336, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, false, "80068ac8b121dd4f3a9dd9add5a4a9a92cc7eaed6724e331194bf943c4217b7d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, false, "8d5708f01f6dfc28c5268a8eccef8d202cb591d26e100bc43e2916a13808331a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163680, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "aff0a4610b4d78f31c89c810e69faac01389ae4c75657cf6dd0cb6cf40b96be5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "1077ec5902ed8acbe8525adc7dd7c27334106accbc2e42e19d70f104b87e09ee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162800, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "40a9cd952454358258fa2e4b8359f67524da8e1f9ce2af9efeeb778d459b4fb5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "99c26ffbb58d299e265b6200923d813e4d5fc8d0f88fdcb8e084ff6c62794555"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 179648, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "527b9f2ff07e3e33a528a2f4d8cacca670f3aa6fbd84ebc5a049245c78b8a5c0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 175376, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "9ef308012fd3fb7008b6b0146aab433fe29a03442a8bd2fb209a762a84ee8e14"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen", 177744, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "7fe74d0f9ffd8ca7334cb4f677cb0c94b9d1998934c6911407de552e2af80a38"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 175520, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "daf26033cbc1d61ea607bb744ac17dd6422aba34f19d5eddbe8867b59a2ba838"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 214352, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, true, "235308a0472023c75cdd6286bb337f407f674702866f8c3adc93a5e199d470ef"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, true, "db17311e81dded985bf7c3c3d03800461e99990c69d85a3bf362d6ae6a1ac531"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 164672, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "b19eac9184433c44154ea6cb31ea97d14f39f1c5c04720b4e0227609f184d96e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 162448, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "80fe1e9a2c39f7cb8f13c7cfd72cb484ceb7b07dc346b2a1bb5bb6068ee7ad1e"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen", 163792, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "b699f3d235c13963a3b31ee18cd72c1ec90a18a6d3277fc77a5759ceab7e6fa8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 162592, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "cdda4fe65d456b9a0d02b5dadd06b6f8fac7064bda6a529248d0efd3d8eb48ee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212392, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "2c5882b7e31924e934e0b928af6b066fef56077f29cb60b43f858c323bf28016"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 199720, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "52da8d635d2f88ea53ab3df96ba2f34205a949a99aa34f14f2e4a2bca8e7a2d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "937e3d4f0aacc3cdbffd1e917f9d7ef474493619adf928633511be5c50ad54a7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "9aafef7a45222e132cef2e149169b24b66391442740e200b46cdfcbfb68f6270"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183632, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, 
"3f37eb7182224e7e3cf50027164e7bde22dcf73c1571aa7341b39ef0aaed67c4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "bcc7ae571c1992642a7550ed8f7df696b9089454f92fe020afa1072bbd92a75f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168912, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "8812ee2e11c14240f72f480ed19f3e6a85ebe4f88b982b73a0cc0bbe230f38ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "43a7192b03e07f60f4716f544a77e418f0f021e542cec4908dc9df301a7218ea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "44bd420acc0d55903d48d6794de7a08c1a1f706442e6bfb10a73e703da2be6f5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "9fbc3894562576c163012ee7947a5fa625afb9e079850ba83d03049ed8776855"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "98dfc8020bccc30a7bc7d29e9243adcd444e5c3de98d84e8ca3928b8b2d9554f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "c7ddb7e2c0f5146345dd753360ff02f042eda54cba9a98add5d4b7b49f98d9dd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "d5f3b08fdb2dcf7503a2bb3de12313e00dd1e4f8990e6a600563a54b23670750"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "2637a7cc3d8cf11871a1232cda111d3ed8af3874d972b2a60528c575d9d66d5f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "b73d319533ad62daeb143a756df292c446e26121ba6c1b0dd0b55dfed4b01e2b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "83ef5216ce240f3154d215faa89e86df477ccc435793c0a64ca969ce81644eeb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "c0d3bdf9d6cab4b23d684a0c8cb0c4c236cbd0496d9345c6c74616a0fc4fa44f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "58bb72e9b6a18f2cf3a729a157e9a2ac6665597b2962e1421ef54c3e3a59e007"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "b009e33ce7e8a58159c780a763ee0042ac07e7360f301ecdd707a69fd8b3cd6f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "df499342e866f4dfbccd5569ba80f13c7b2167cc0aec422183f174e32d7a8dea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "84657bb40ae4b7d7558e1a23e3d71d61fb827ab7295892915f86d3982e1b1d4a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "68ed8b5b460a1a291b70634d64776f7dfcceb27a57786010d6b8faa6abac03cf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "790a87b1a4dc04816965c19ac1a1ad78d38544d5c254baa6282e809fd5acb8ab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "6fb9e51afa50aed16786898f08ad4be49b8036538d1e02c074fbcb11c17d76b3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "fb38966052452966adc01d224e04f8a9de8760eae50cd622a100ef9f0b311c10"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "4f59e6f721849fb19f39c00864299f3ab9b5323328a032f6f27dffde1a74fc7d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "58b91d89f5a8bebb0e362a5f4c31425fabfe5774271ce4d592b28b2db503061e"}, 
+{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "a687f823796260c5535987798e2f2591cea0b609c33990ae4f480b18e2c0f30a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "47a9db954ce3b11214b777e0c0651bb4376098faae2ea0b005d22e19122a2875"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "d8923902f35a974d9bb92e26b18296504298c386446be3f4779886ed2818bc74"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "2e3f6440786941e303cddefc281c08ed456eef4639549814377ca5f4071e207e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "945d2ca53f84728c8a81dd4f57afd80aef48cfc41f002f171f993da01cf1fec7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "043988121c90e35ca795550a87246208a16bd5144ddf7d68414cbca19d19becb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "28c0fe5232d2301ca1150ef99a21bcc669537ab7b312393e6394621a2eaf2aa0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162144, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "e2021ed3202b69157c57802bb4c5e13566008b126918ef706103154f165f874e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "a53e1f0d5d5d262d834be199081f8d859dd90b2b34e83fabea80cb506e279d10"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155488, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "d535ae8b748b4f93bcf8e206b3108dc63e2c4b6cec535a87791602654ed012a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "f184270b0db6ca944d076071f0e8e1390b6c55c2d554ebd64a5508028dbadec9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "a7143e83e573f2041131921df610980e3a02027422f1ef4f06d02fa9dfa024cc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "8a02c287964215eb3d25ef76aac9b5a8a0138ecd140e9d24103b9d628d68cfe8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163264, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "1f540d5efd4fe0e0c0bca724b886d49243edc59964ba3917a6f5e921c9f33372"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, 
"a5ad300ffc48d7dd12bebd70b40efb7fe3e3acdfa81432ddf7b556d7238d9ef3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156480, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "8aa8ac93ab43d1c3068834229641785627c9d701390b893f112ef70552683555"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "133a947f57d45059718faccd54bab3d1d1cad14f9ca56c15a205fb898e649299"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "33b41c1f69dbc27c1d4d010e25f10d6ced1b35bb635926d828934c78929b1d5a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "32d6c5893b50a792b6c788c6947b3e83dc2aaf79b1d2db17805d2435bc8844be"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "b7dc2e55df0aec730126e6e15637fd198400c193ae8065c3656bc348cf3d1baa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "1e4b7d5505681f895bc2c2e6bb9cde7514a1d3cb5cfec689f34fe2eb3262fae7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "d7b77e1dfd7ca4b1159ee1c59991e1097abfc01da74e0ae1ab53ff4042e2172d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "c4fe2d39487520fc34d5b8295d7f0c01a79218c1859f793b123c6568462955b6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "9b7092074c37a73b99db3319ed9b7bbf2c61b20a17a62b39c30aab683eda91e6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, 
false, true, "3f0bfe825a29a489a1d7fe6c69914487911093ea0d1475dea7aaaea03d65c511"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "cce0c51d9b65d66c27a0b59cedf638e1fc8e34e14e6b5006f00b5b2a79b71a96"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "95b4e06acb672667d1e6840fd9efbf5722c4c51615f86d9e0b1c3b97c1cdece1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162144, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "b070c10610a6a9af0ddb4e3951007eb84a5f5ea161c71fda2887a1e4ee9592bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "8828f03ec0e6156b8fe59ee587deb7e71e9ed6d39627b3de756f43575bc8584e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155488, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "de26b447b8a5047d47cab9a1b27aa3319301122e804258c7e62965d4ea9595fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "9050241e4fc95fb7f6286a9fe61d52b51b394e3e5bdb510a7c8bec2c058f0a68"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, 
"ad9348a1865bab4fab831f0e6be58f2b8fcbe562de56b247599412d6a69cf168"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "77a246a5206deb0234ac0ba1df6b8c9deff2207c1b3c5523c1469e75225fde26"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163264, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "a42692457bc3d286938ae6312d1cc9a70fd7a2f0848b18a20ef4086caa999179"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "a84d301ccf5a9c5fae1606de6ae72aa6e5a0c9277a51980128e10ea0c5dd0816"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156480, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "a37b1d300facd688f817f8e05e50126f3da4fbc86d26059b07fb5caafbef79bd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "dc07884c8a0418d11149b30b168c835268ff4ff8dd72177a901af8b68f40abc8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "9917cdb36e95a9f960239b4514c315fd2267c6d9e00816d13665706ae9769160"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, 
false, false, "d2371ab9c8e7f5815b3709311749ee9b210b13d40163e516c66e82bb0a0a938e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "ac056658c00a467d242577e91765fd66601909535d15f1396702124f1da1c106"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "11661dcac228d98bba3d12dd60a7eaa89e2abb955b6f0a9c1f4b1d9a826d4ea0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "48eb569f3e94241c8a81ad82d4df5b8925a3afaf17bbfc9b8193b94c1fe60f0b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 
384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "ea17a32ab1d4b27b37a62d09807c6f9aba7ba80a5eb0938f799849dadde420ce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "cc66d30d18a7e86f82a60d0c83508d6e26bee519977ac7dc17971b58472f5b8e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "6626bec7a28518231137ad5a01ad9253d154f10d84f11c3869c483bb73ea9b74"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "85375ac16f71642144a83937de478df2c0268f80eb4f79b9ab9aec67f0578d6f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "b721bd2ef38d2daa16e73b38f148f1bce77461f5ec90f0a70dbe60bf01c9fc40"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "7eb12a8d08079eb40083fc5ac479a027c04e29653b5f6f89286418527e2d359b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "2a1603685b94e7f14a00bded2fbcc1f1cc6ae90c21b742cd5c287b415f46cda5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, 
"fc13df492a6722f977475cb0384e92196135303987cf7e96c425f86ba0b615ed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "96ffe10c899421c870228842c856e275f2d87c2631625afa140eadc9e433ac05"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "359bc1c47d733e8bffb0538d008ff9334b9f91c098340aada6f7adb779ed0e6b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "50ba7efafc5795d9a99fb0cd64b8322749609337982167b4403fd74a3e526612"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "cf8a0011cbf70bb0bb087878b77365221791ec4c0b9520442c7039156f86eb22"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "d64e7091ad4bae2441ffa3611d98329a0c3b812e153a47af173894029057a370"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 196376, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "7e31c999c849b63ab9e79ed562b8b7ab76f7c6a62ff3f258e2afcb92a7ace063"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 189592, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "aa2776cdb8f9670eab9b1fb3def03a83aaf0408d6cc4eceb058e98568d2a9582"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 
256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "8eba25e0dca384ce14c5e66cfc99f333fbf24f507b539faa44e09c31a075fc50"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "aaafb67027066ecfb98a37b0acdfbd045e18ef5ca202290379ade2ff34c30cca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "31eed1ed1575bfdcf48748fa77bd3745a2c957b102478b8f8d9133f3bf29be93"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 
156304, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "20124249832ce1a82f07a9f33a91d7749687cb915940b36bdc66143f8d2fa815"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "b36630d5d8d32194e80e680f364f734cb5fc2c795fe5004d22ecf3a8b77b8a66"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "39c6b96b0320cdc6791dac9e51860b534aff43ead1c6b1ce3ae331d49d951849"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166240, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "92b7ed6d0e1b592aaa95c529e5d5577115ef5ed18e9333f55dc6a1d536125919"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "be19c27b459fb07ece6107888f00ceb5e611f1890a43a3c8fa5ab9d3c67f9045"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157536, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "b511c5131cc289714cb68733098711d7333a3e7cb35a7b54198fc92b12ec30c8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "f4587d881c4912155e8d2484b4b5edfcd83f0db58133ebfe8d6b4baffc91a6c5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "415894fa0228fe724e85b7294265a4ba5b309d6b4f28948aee4c98f4d32d5844"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "8f8de4b52fc9dcdbf401cfe9949f623d7cdb0cb072dde42cbb63abc5b0ca3e04"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 167360, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "3495f093e63327e0dae69fbd0cf6396224a8c2879b7f7c7969efa498905c3499"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "ae7510565d0e4e33f0927829108bafca54e2a8d0f5236fc83e435c7574588e68"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 158528, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "7833bf743d28d9d305ce02e17255a5d890a77a1e3c7f08293d2075f0348bad48"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "fd21ceee8204e34011ed1d10a73d7f2798f05a6be7d8202ec27b0988a193650e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "655f2b10f7d8c653ad2ab42e384acd6aa01093a018d7b898ef392706fe4590c6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "5745454075d4c8f9afe2c013cc0ef231912ba0a2dd46a1e66d58c8455616776b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 196376, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "383e607bef7cc8d21b2b8300ec1cb1aac8d55ef4f6ab2337689a358c54da271c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 189592, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "9f2abd48093177f1c140436f3578f5c8b93a1a1bafefe44baf85d8b047e8c8c2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "e6bee1e49283dad7026ca94fc33935691193f0e72d76d038f0f8dde5055b6ec4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "024e2308bf7182602a7ad16478319a0db455007480e9f6554d1fc48fc1cba4bd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "c8d78b3a712dd6a8b5a7165f56bd1f17b4a955e3fed23252bbd4810d5dac3b71"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "9d405b5aa15c7f4e08b8b248de64c0fd2bbbfa9d06cb4450a0d1fd1e5d0eccae"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, 
"b5b9e5b7d178939ea166474ff61724a916bcc0a37162a1df2b65c1ce306e9fd7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "be30cb01b63743f5713174ac110dc160a05b1af46965ed5ea8adac6aa20cac41"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166240, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "c9b7d88f22efa8f229fbdc3d419b931780ef8a9d07ca0f3fcf3c4d6935cb6612"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "ec939a5bf2521760ff89eff9958ac2bd4ebac77a0c8beb5f17e7c3bbcf149769"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157536, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "6e7ebe60fc04457e5c2891eb3b1e62b18de288caa58f4501b10ec2f1989489ed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "27a4276b248836e199d7f2c1f1c0ff9c62f5016c62ec2065cf00f2c571afca9e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "88dc23b0ecf75e02527fa913f0037bff6df1fd731f837735e366ec1f0de375d8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, 
true, "41e9b3e42f29516cbb6bf9f3503b753dcfab373f456efbd101a8190e47814093"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 167360, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "bad346660337963ede4af6a28e01f9aea46daff36746f1ec519bfe48d26d39f0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "9501c8c5e34d08ad9b03ee53ea481df186739c73429abcd22d4badb328927f29"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 158528, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "c624d5203bb15283fbadd8e3cdb4d53a075eed145ef5ec4749b202f6bf136c06"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "72c3853faec6b8af44622854f863abe3a88aa2b038193f2ad61e0a608dcd57c2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "7954eb4c27f37969ab50f0d3b512e539b54242a6fc16f4ef6c738947a3c1faec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "b1b27460ac5560acf36cfbea5b1e70dd1cfdb2914d04d7ab19407e19bab60e2b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "2a24a589ca9e2a10b10b590ce6d2de468bebfade6a7a2076cdcb4fc7d6876080"}, 
+{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "02e0a173ec86d9fa329723758cc4dd27a8d86521b45830479e10165cf5fab580"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "dcc534c2b9939de68190dace61ffb7d3624bb29f12f8b33cde31085d019f3315"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "5d128dc2a382dd539f8b561a286e9a6045791621d38b6a8f4e28f70923d3118f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, 
"b3542c07e81ee5a2f1e3eafd87cf43160405ab52cc8c6767f6e8ddb3cc7d91a7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "9170663e69b8e339915163539ebd3945f9c07afa9f2c6c067c6a9814cbe6fcf2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "ee1b059a598c8ef1f97f9d345d92d25b0cbc6afe7f6448a0202f6bbf87076103"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "eff54e1178cb187072e5d8a49e9c421c98c63c702bc1a36a55436a64196732a2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "949f1982a83f1a47aa9d3e63d52a1fb86defa414db0afc9dcf349f8aaea987f5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "422d4180747f859f3988ac3b5518861f4b14d011f8a3fc1d6c269688cf41dcae"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "d396c459367644c96a89795206d02682fe415ba462da6139298ba189ef961176"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "2074b9f660fa015684ee4fcf21074693e69d421db4f4017e3c0ebfe9fc85f923"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "3aa3e54bf0664a79883717bf53600dff82586fb181e9c39fc7177f736c5f9639"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "81024ff145f7d6ecf1b7ede2d1adf7365a33b130a336ca5b2ea3d2ec89e7689d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "2524d77fad204316f0066741ba497b0f4932a6855c0d43dcccbcd82c08f4fd4f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "f81db2fb1bc64c9f203e70e6360a3ece6a84805a77b3c27e2ce57d9de6f027ff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 
128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "ce4cd7850857516244020ff8a779683e90075760d29b422192a5ef7c65a9b58e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "709969d667713ba14e509c55a763aec529d731c9df0a2810eb768934c24aa775"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "119f7852c8711e578510ca62ed2cd44f5a7a1abc9665c26f7b10645139d64b62"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 
512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "a896462546a62e5e9e8f066c536ca144a7b91794e3623da1e0282f5bd2cf6850"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "31978b6fd69ae289495bacd93e6a8f5cab78d052109a9df9cf3aa8418710131d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "c8993b083272dc162f5d86e9359a4e24ba1716ca9dc7fcf37d6a68337021da6b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "dc3d2e99476082a7077a50d0ec32f1379476ae432cea5246cbc45dce324e734d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "6e9f4c5c5baa1b5aaf23b1cba4922b2e8007ce4b53718ba474a10a4e380a35fa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158192, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "8463be430cf585685bab84a9f3886a13dd8f3cb045ee69a766ae088e5195ba03"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "fc85842966ca81c50eb1db334efb98a234a01cf421340044b8132a4a007b3b1d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153584, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "bbd472f2a818fb64eb776d346b0e63ab60336ab0a9a3f455bc49b05c092355d7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "17a2946d718af0a662fcc9e997803f3114a564cee1edf25db129165c7c005563"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "4388d8a5cb75aa699fa6b4f36dfe104a43d816a8bc5d3fd3a02bb16d1a0b43ce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "932a0209833e2809eda7cde7dad23c084977b8db8e2df62639416eaf48b78d99"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159312, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "7b8b9cf6b6e69f15fbdf7abbb40e6abd5c34b2364f164ca5274917afe1cc1a2e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "b1ca9c3c10c03b0a1c89050a2447ac764868d98795e49f9fa1373d147b23da37"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154576, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "dcfe0ad875aeba3b6e42ecceb9e92ec5f1b96975ce4bae47d63429b4bcfadf2a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "4ccb79037ccf735ddc7802cc598ab63fad081c78037f53401dcdaf4a0b7cce67"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 0, 3, 
true, false, false, false, false, false, "3c84b966c4e81ea12a5d5270c3401b25dd398dd195df5e3102e942df01b2c1c9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "8f152a12d459634ea4ea217c3c341b9b1dde06fefc59c702c651166f409ccf00"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "54280411534096ebe6468d067587d86cbc8a69e6d974ea943bf142f410025161"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "82ec34a4d2830b45d3a2eddd781e446461ac0fa1df4855918918d0b47c599833"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 
16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "46ad47f4d38276c6691d6f12cc23f4f08ba0d330edc5afa8842a19fab9f737a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "1297399f7dbf5676e58738e6c0f5cd56bea024ebdb1663c3daca0d5e30ef57c0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "ee91deae0963f621911955a305089f5ab97b5fbb22ba8c4f459fddb25ecdfcba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "96c7020947885c7e87771622b60b2e93010cda4dded681f839f0fbca801669bd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "0818b78269d3131488ddda042ade0aba748731db514fa1ecc8d4dba13a4caf06"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "2280b718d3312cbcf4b3a42de0b4e9fc7f078c2556f5a2a288a7c81c066b5c43"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158192, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, 
"3490bda7fd471c4803aae93a0d66d5f72cda7f7de459df7adbfadbfec7609848"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "a53a6d851abded3d7b1880e2cb88e4f90cfb629084c25719363ae6c0fc664a6e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153584, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "309fd91aea83359ded6813824819e8e0277cc74032b70402f58e5856bd3c3c25"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "d976b098b552b8b2f71561898842cd0d526e7aca6f3949c8433e9ff11b08dd0c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "4493ae3274801caae3dd0264aa88f6e5100dd25c22c81a4f5cd1756b02a1886b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "473ec98f20e34e0855b2f8017459716a9a3dc3261ed5e6959ea3fbab5f072df7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159312, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "130abb191ce4744e6f8ec1cb0f9d522c056f63c02ac534d0ba39b4551c2ba9f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 
2, 2, 0, 0, true, false, false, false, false, true, "fff66faf7f2a84426d6e7af7d22c42e6b5b0a9aee7582c83c2109bda92618563"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154576, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "2f5f1b43ef532b60cb651705aa5ecdddb2373f19ac3db43170f48dc1d5caf270"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "b934b3dd829839dd678b835c98f2945c9398383e4c1b05303ef5218ce3da4e08"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "cc12a2cb0b59741b26b52caff03eb7558a4ddbbef89f991ebd97439646677730"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "9c973988a2e333ee21b78a71f3dc0c6ab4810782801efdb9edd17c13c264863f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "896b3a33e36b595d3c963b62b4b6a597d0fc2494a2bc3694ffa23e04eda8803e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "8c389145d82ac28400e5395850dd4badc9351a92233baa0f9bc157ae643ff5c7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "cb2ecb511b930791e7b695e7c2207d6c16b0e634f3a92f2ac32054042d700a44"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "4df4b4917e0144e6760acb0f78d9450ddd5282ef3389b915b046e3e88070908a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "a660f250e2124ad308aff8986e37321d4ba67f1f9dbabe97779f7ef53e2053d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "662daa9e7fa807ff932ebc53a745a510445b2ab5beb196e902aeb7765ac558c5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "e79bd83ff036e6256ccc88e1cc512ddfd2b1c3920aec19af3b31d192fb0530d1"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "4238ec8ab5bea047e88502a3a80fc168d85883c4c4f0bf3f2f0126368c306aec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "0e0e93ef168254a47cb67798821996491de82e2a85e14267b6f912eda6f2fe96"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "44f0eaea7725db316042ea6bcbe0b36617eab2fd182562c63fd92d4a5bcddfd0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "a030305b673bb352eb2e132ee48655bd221e5c4f0f9073e1081fb1c701c693d0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "03da3ddd4572e839a5debba5472ddfa353bb2ba85d98d1b1d13bb212311d647c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 200888, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "04638cfa2d0065a3a18f00d5e3187fcb56a627bb8a19b13d85b9c3eca59c7214"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "ca1888da1cb9347982fdf9f90e9f6cd88245c3f2a16e2e84066ae5a1727d3c42"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "b6a327c682fdcae5653af26f0e91ce9f7a3ba44ff27acac980849909745653e8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165072, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "f5beddfaf339d509d3b96a794f93b950c3b90c8c913f5aad6878bc04632e2179"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "8a2cd6f3dac4d99063351cb9de8ad7b34d1c5ab0442bc9b5c575d110ccf1269a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 202264, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "b73845c4c2e80bf75eafeb784f8f854f3f0bf1ecc4ca5a74449cac4ab99a3281"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156880, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "d089fcf0d040396603e1a9336f94f883b0395c74b85704259359ab1c591ec61c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "64568ee683f4a96c67fd79eb8aee8b660344888a467dccaf277c4172ad00a8a9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "71b2611312a3f4cb739295d811c29d96e29216d3dd1e410df3530d95ee60d62d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "c4f4e5aec39fda0f8f50934f48e6d0c08428629497ab069cd0fd0ea29fd52424"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "b4c70bf1491253c973c09adef3360517e787a8699f3f7f84194747f7265cdcd0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "08af8a7782279170971825a99aa4ef461a24e253a576539bad54541a3cc11eda"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 1, 2, 0, 
1, true, true, false, false, false, false, "581a9d46845a680a9790699c16a6da68c71cb7e7658e1a85e294a0fdb9a2c0ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "055fe1f98f1c48260c04eea81929448ceb6e24b3edafd9f6c7ded3fdfc128127"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "89c1a09477c975f6643a284c34d0f222cb4f9878fe7ec66b09a10106a6094bf2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 168464, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "5c62aee4c068b9ab21c435661faf4e55f20168f37e3249cf53120fc371609bba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "4f28baaba4e7caca7e6533764684c1dcc8bed9dfa93fd6955d75a497062b8261"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "b3effc70a648fca2bacb0925256d36e9e2cc444b1674617b14f43d2c7ef9f965"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "d963ef996f04f2b5ee906f9e332e0dc801594fff44a23cf117d85b14677f01fb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, 
false, "5d03397fdb9a88a411514a7597da361530434152b3d2ad17a3b77ee9d37dc194"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "2add522ad0851ed5b052493c994e7ef3f3324ccbc6becf30a3530b6a7c0e8062"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "314946aff1e6bccd88666d9047bbb562c3de1ae476a57b1a23f471939f0ad4e6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 160096, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "18d55ac990501bd49e3d254ad0b77b1c59dd46c3466ba59c0764bc9a95fec64b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 1, 2, 0, 0, true, 
true, false, false, false, false, "cccd65a8e654762062775509b1b6a13d9c03b0b2e394448d8c3bcb7c4f2e3052"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 171360, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "8cdce1944ac9d56f008e030f2dce56d8d09a5e786700cef956243411a1c85948"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "662d54b335a3e54a1e9b1e1824608fd9f5425fcd7917b4854b749f81b005d837"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "01fc0ffd3149d9c6c86b0b9f9481f1503af27ad4bcdd39a80d91f0960677f6ea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "7ba3f1b0c95816dda249839a83cd42729738e18ff6d6f9c106281dd33a0b6b19"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 154464, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "b3dbd174842040f101bb9f38bcff85bd787b795f5a14fc3f3809ad13f582a0ed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "7229463f76ceef404e366ca4f149080dddde1859e9c99e95eb36ae7456651403"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "b9ca96fa37ba862021e262020766140800f811a643c22d87aa6d89eed912134a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181616, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "569584300605e67e14446f57b55013fda53588bdacafc9745421f61d4eaf4f90"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "7bf9ab3c5d4525bd38dbfea36990ea68ed93c385ae0c004086de627264850bb3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "35728bf3b6c287b4d8d81170751c36279c22c1685de60b165c30c8d65db4b7e4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 161216, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "19e2868dfa0299ce06628dded0e4aee29630bc5eb41b02107e92ead597d3b0c2"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "54c4cca9c8bf681197800fac29088822fe6f0ca2fd0b8655aea4ba2d05f0b7c0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 172736, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "4f15f697e9a68b4d9546310d61196f885e81f32ad6f0db1364fee2459e6a578a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 168464, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "509bc0465677204c368e3e92c41b8b18c5671d29bc8745494690abded30c9e7f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 165232, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "a89d971d9e2b98b7a5fd28f46686d111ef1a7fb88383ce0ac2c35d97ad37862b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "034a44f3d70b82f7638a826f7f967a48afdb6a8b584f6c3237c640b8c57b2a98"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 155456, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "852f0752a7eab6b7fb17e93e433f19d0bc2c0bdeecaa05e0e161e73a42fe5969"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "4492c7d164ea0a8a433ca65eacefcae71aa2ba19da5cf7eae5444266c14415bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "a91e449008f942725375b82008ef5a25b0bdd510cb341185f586bd1f4442d036"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165072, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "8f48cfdf45a6c5041abd35452b9a318c67dad4fd280aefed4539462a63bdb39f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "351cdd0c5ece350a0c893d1512b8b20f04fb4a19082e0a1d33201af476b76385"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "1abd1fc894f777cf3d90bc47826ba455ff0b7eac1517cae7891c53f4b7a1a959"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "06d9f1d1c5be06f0a1d90fe15c6506ae5633917280dd72641a5892f3dd48a551"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "b637b80d58a4644b7e30594567d22051fb4c72b893527214286d674689c06552"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181616, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "708ac0fb5aed6207afabb71bf908443b9dacf5a99779eb77996fd282c0d09ea6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "d6023894aeba026d1f38bf3f3e4cecaf0781058e2f8e7272007a444415c79354"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "7b8d1ea5cda57189de64052e51338de7c37b41a16257556d17caf07af5fd3e55"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "ce75c0a4d9c3ea6fa1f21a0675424709a3d12143fafc60b83403fe9c1e50283f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "d9197767ef573c66eeccc273eb49a18aa7be6861f8ea1040e981de3e5928b1e2"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "a8507f826fc09d4743b775cec40b2100dc07ff9144b236b9042e59a057ba1ff8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "229a2cf7bfd6938b5da892beaa2c1ca93c3329f6ec528e7c8a418350542219de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "6cc4cc75c71f7b6f92710ab6ef243915470e3e9a26e3e0a9eeb8ca895cdec51c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 200888, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "f1d98ce06cdecc1a2a8d79856d3851722e9dbf2bc9c3bdece0701645cb684377"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "53089adc2afb7bbe96d510fbf2c580271bfe75590fdcfe875cdf6af98581b1b4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "b54a9bb4d2215e217d134936e4d24cfb83754de50d01d2f52ae9771b378c8907"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165072, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "3dd35ff943a35269e4e2e844c786bb561d044b85645e4611d0a0fda2113dd178"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "f3d4e441b6b10f653dee24e36093b6387f034aaf24eeb1374b1ecd195754fd99"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 202264, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "9991620c71c155a28397eb968816ac93909d8cefb9856bd1e04802a8bf53c779"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156880, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "d89957fe7e053dbe6775ee1997cf44fc683ced4272bfcc272af808ea106483f8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "a606876249943a63dbc9fe508eeb94edcf492fefa343f92904b7c3bdd798c958"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "9f2aa41c6fff922aac91d975f344354aabaefa3cf443ccee29662903936d2962"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, 
false, "cd244f402d4d4584982462314766b7c9757f03a0a501a514293028a04748638a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "b45aa0be3fa1fc5b52354dcb33f3070ecd3825c88b9ae3ae3b70491ceea096d0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "49de3ae69c9d0d2d0ad559d131cfd3347e39be1be59dc4f8456f38a31e800845"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "f0bd84e9fbc9a530c15aa718364f92d7b58c249c632f00bee0351c4894e57c1b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "7073f6558d2d5a05db8154b145cf7420aacbe351496c32cc0d8a79907a134b15"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "9f57f2d3d858ed2b29b569f8f0094e6f0f548f911e90f72f75897d775eb338e2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 168464, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "829d5a8042cf18ea05feedc4fb1dbf6b69a654ed8b873115d0dbd92d5b66c181"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "5fde659abdea379cf5f6bddf1add9b4f88d5e0d56775d81d340e33fe3be6a43f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "164e7f6721044e835f15a621b02bf8c4570d08206c52e68e107c9739a52fc047"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "df689bddd5b5ed7d8aea26dbf4f81fa8ee019842e144c17f12a4258376bca844"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 
181600, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "09c290a40bdf9bc499c79d9ebc1529455357f41d2e8348c661b8767c6eebbff4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "551b1edd428c9af6362e7d49999e6b388918a3d31c2dba93ba17f8e5bd65daa9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "7584aea816cd32d0e5c041f79544e7d8b6203d88c8dc271e4588e9a5eed04518"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 160096, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "d9fe1d74bf5ea23947401177f629bc0d4023ba0df5946a6463d75cdc0e4356fb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "2740c25f07d6e4abb866b7db459a89669ec165811aca6cdfb1a12d6caefef48e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 171360, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "6e2dbb4baa5f6dc478e98fb21101a51a2ad32eb7a716aed61767456a6b6b77e3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "ed3d45e4d750822076da39638ae81649c01d1f3772dfc6764575a409c5dcbe09"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "4f9c72543e67df5442c0dd1c5c3e1162e2ea4f1bb10fe7d2b463f229a98e0c5d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "9061237b43d910839e77fe2cab0ab430e24716d42446934a04e0799a2d959526"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 154464, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "3cb696f5954c7ab0dd8733cc4da0ffa78e2a6121678455eb620bbd627d7ea318"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "33937e9e687486dece6d769f9748932ffe79985dbd442a9e7022fdd5cea15521"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "535359ef7f0b095bea92fc86fd8ccf26bc131929582a1e8a0424fee7ac4ba42d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181616, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "2e0150ed9b422451b8240e0b8d9d3495049e1421227df10154b367d92e5ef867"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "f2140ba0dffa96165b96683c79a626a58b521485fbc193cdb1a389e69f5ec432"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "7dd4ada1c2af69deadd2069b039776601a49195fc606919b085cb8f1c4e169f5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 161216, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "b158e54d425dd7b424a9b888be446bb0501a712e3350629a02f6f0fca7bff445"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "ce93b0c39214009f53f6c2aa29333a0466f49e0518a9720804a6b17aea57dc4c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 172736, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "0edfd32466d6517e57c5fcfab2cf789f3482f6c563d7596a144ead447086aed2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 168464, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "1ce3b7de11213a68788d99d38c42cc1e2f87fc76948ef964e4e39a422ee1dc62"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 165232, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "678078434600cf39abfcbc13845057d4fb91f19d69594596ce26ba92c5bebddc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, 
"bd208b35b86228982e729ca609928cc8ccfee542926aed5927dc25b9ee501369"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 155456, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "af2f8d46466ab88ea6bfed568039598abfe361971b05757bdc1e2f8f46dfab60"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "d91c8fea58a47876f427522558e3afcb90fcba0e454a1d6c8ed5c977e4c6f01a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "b47ba4c483b80f7ad2a27abf5309c50140cc44940f0e2891f8094ad47c273172"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "4ffa7021161f5fbbbba8fd895e39bb966b49146beb2a05a8cbc51fd75cf4903d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "ded0cc5cd4c0c4ade7cb2cba2ab0d9cd38a70332af1f8b290a20289057195044"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "6c595713b639f627b04f4cdf38c8477524792e94f9e36355fcf149706b1f24e6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "fe9345ec3e4cc36660d62a4a689bd4fad60bfaa08cbb267bceb112d028ae7534"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "52a79ebd15253a4e1e47727ddd19c2219a1e531b609f8ca0d83ca9120e089e1c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "1243f4814da0f924191fcbd01be7047bfeeeeef25618fdb9eaee86eb550dedb0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "b077c4bea0f3fbaf2a59f92e878d6d0d038a91efac1d3bd1d89217e355cac012"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "20f12812c863230fc85fbd4e0aa6d5a78744dd8a8746efd8b85e577cc7d19e6e"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "cfc7d360d4556561649824c5f816d803c82b85ff8ce88425c00bec321a60505f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "0c06ebb742522913fe5a1a1a8776a79c4b9f17037a12961fecf26fd8a4128272"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "f7b6fa98446b610f18259655fa73af5cbf1480cd441e9b72109fd110f5f3ab64"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 214208, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "11ad1e667e9c08e9705100ef2e8e68c710ee4e9bf645512eb3d748df3b49cf20"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "d1bfc0a02f3b74dea77970492da5a25b2745673bf28a4ec5cf300475521bcb68"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 208568, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "24b31c14d97a00d74915b87999d3268c6c309d5695b2a4613b189d276fb7c4ce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 181440, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "0581a97204dd3e43790a8a2ae571896efc86a123de2efafb7ef60587cb307f7f"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "b03bb8b5fb17f2d9e10021f09db239fdd9e86b97b84f4896a2b6444c602d7998"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 214224, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "e33f83d21383c0178c660e7f285407560ee4917ff38c26232deb8f1c6c65df4c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 196376, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "1624a5a4534d4e40e2108d856a65f7bda5a742961a087a406715b8fd5dbd7965"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 209944, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "0de318432b8b0d23fcbfb8201e4682692dae8a53c7e6cf478bd53a4f2995c0a9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 181456, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "3c8c98fd7a23d71cc0189996beab4598799ef08471137de969232ffcd42ce341"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 189592, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "446fc16285d0ef76d80f8324f01a4192a38a3ce4a2ebaa79687ba730790c3e10"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, 
false, "d1c5bd355b71f8eb70cbdbb25ca197bba2297d73c6405663e1fcf35fb81db332"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "ce74419f1958f29d8af27b7ffccb76fee6dc8aa16376a22df7efff7e8e346d35"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "81cb1c8af12e765b66536f4af9313662719183d5bde17ec5f62f59ceea45a04a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 181424, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "3391e551359d377fb12caf5439809e727c17f7cf8c3f57883ef17aa2450c4515"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "502021eb74058734942c15e6ecd1ebc3263bcb1a26475d319622f902fa73318b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 214208, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "019b7268f96e33e9cd375541177ca2c4cad487b42bc97636ce47114905f72ac8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "1638db0c97c4cf9417294794486f97375e6a4746e6ff49340e5b4db0a863240d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 176656, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "3e78234f9cb0f86828b378e414e2c37a28d4ceb69133399299fe176eec8a0652"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 181440, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "4d718ee991e435817f2fc08746956277a5df88b2bbc6ef48efacd3684a05ca01"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "deaf4c2b424d0a0f9fe62a912533595cb557cce3903bc0fd2b43699707e80e27"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "bfa9e89005ac85b2fb72138bdb712b4203170c98b3ab27269178692fc5e7e734"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 
214352, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "5e6110945f88be4b089851a3264c63a4023324406b0ff18fdfa5c32ca69cd277"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "8c879abe952dc5dcc00c0b1c4d51fd07f2c2688edeff6a78b2c416097f95ca3c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "03bcc147fad7e6c6fb292cf0e00cd47256a743562ac9ca2fe6d933b6795db986"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164192, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "d62990186e5fba8b7dd9f22deb072f06b1deabfcdea164854e7363cb833f8464"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "45f45877f976b7aa1fee9a2c229882154d2b62489ea0a2c611b852338eef1bb5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 179552, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "22f4b470939fa22bde9ac711c0f2e583361fa67186901b9fd07bc4ed587b6754"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "086d91183c2ce98a12e80e3df8be70d99b22b6b174d8569a545b588503e3f080"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 181584, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "1c0ebb326425c3a60dcc809465b5f037cefb13b2a717f452bb0288c3562defa2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 181408, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "50b45b592d930d346964f8560bfa7a1843018e873f1afbaf096635aa886513bb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156512, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "685cd5b75c623092b9a71c6800054895df0d725da72192e22c083f42edfa118f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "bed743ab1730027eac1452095771dd92ac648b9ee31acbfefd70f12a2f936536"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "4d583eb79c5d9649ff3f3fee96aaee4c5787fb794c43844d87a880cec33f15cd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 214368, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "e399429acabdeb9bdba620517d753b4efe6f3376ac165103a8908165d5d86143"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "0f74980a96d66c7858ed6a99147b94104f4eba58feae9bce7579df42ee2c7c65"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "e66f1b00d2550ec6b1bcceb1e3d25d0512cbae845dc1d7e1104616081b30f90b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 165312, 512, 2, 32, 1, 2, 1, 0, true, true, false, 
false, false, true, "360abe1adf036ebfa58d67ea41d5cb2a9fa10d70b2cfde404105bfe4f6ada663"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "754529d26b1c7b2d8e11e21763b994985a091ae94c773ed966c12a7fc6579819"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 180928, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "874602a763489d0aa3228cdaf690ba7c0398a9f6ee8652ed74fb7a88bb8a6965"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 176656, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "3fe036f3fea78a9e1188bca73567d75816f43c2c92b9eec8a6cfae1f219d9c33"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 181600, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "fe04abfce95ca33a31d10dc5c9fb1f33bb99eef87a904affe3542ded9a0cf70b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 181424, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "a34d59a56817d5852c581072e68a106ae665ec4096150841e34a441bf4ee2dad"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 157504, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "7c06b7fce72ae3a448d2be8ffaaaa5b7964543980e4a9ed58d7d7d6d8e26f7c4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "93bafd57bc7269246c36164c44ba856782da85345f95d1210095ee85450722d0"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "b1480063ad0aea4ee7701a9ff4e6a8401ab46933e887761dd7ff718ab014a9b0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "33a03d94bc71d767a5689a1892e5acb3b233b3ba49fb07c4799643a0872fe4c4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "1adbd754db662901c97f6b01c376330fe65e495e8ff188875d4ef6bb4aa04200"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, 
"e64ccad938695e4e6662ccbf443c00d507d3a4338a10c0fd4cc3812ea784954a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 214208, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "b5e3353ed1beab2f62a6b8902c22af9e38a4f63ba54adcfdf1ed604e2b5ec206"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "f28659fd54886a8c0c7131b4971fe7874bb2f708414dce3352be1a1422db1ddf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 208568, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "b8546fbdc76316c0e6fead2bafed5913b15d59f814b17e0dd642d424d8781471"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 181440, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "17e98ed4b6aa3d727c3d9ffa8b904095509c319648f53ac4a9e3f973c1712345"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "bebaf2f11d067e6056da10b22a0ccf1224be2b6017fb45f3d539ffe59e204e16"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 214224, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "be9d47d6e19089cc942c4987e6896453eceea7c1a7a0181486fa042651063618"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 196376, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "94cada9f0ead53353652b41b3ee9fbf6e897dbd93f787e4d74444186e9ab93c3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 209944, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "0f335826f00383e1a38b32393ed96ffadd03d64c886c13e5192e4de7f542278d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 181456, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "9f66e88a5b698f1e7d3176a571580e758dd16d4dae6b7a6952ed6a9b0259369e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 189592, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "9b6890566673cd32b43e867b4166e497e2e9a89b6987a575d27bfb0acea04194"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "cf9bda3267792a384e6f354f659d4993960e215bc7c0c51c1e8c81eaf789ebd8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "88087baf354607aa559efe0f69e268fdb017ced71bc516e9da8134a1bc7b2060"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "fe4c0c6aa380f1d72d19649aef82ac774ab9087ad098d788ad1d0bb808cd261b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 181424, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "af57983ce91beb4b7014c739cb64626b8ba2ea2bdadc99cb6d350d5d6381cb5b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "6155360709a4af58f4f3264bbde696b2ce65b2e130f326f6e2317b317bbbfef9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 214208, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, 
"2c5947d265f5cfafffec737b6aafd9b4fd4f588aa20a94ec7609a66604207a17"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "dff0519f745f060b715baa82c5d569630ca1e06765e701fbfbc45184a0fa48b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 176656, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "8403114f3ac7cb74743b1cd73162645384ed9af364360acc9463d75b6f5d9e7d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 181440, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "34ad418199c574320a78e01968a39d53c0ba448e8f929bffd629eb40387efcfa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 
256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "1fb2ae26773d6122657cba32535234d54c2b31a57843f0e64e7dcfca697a8a7d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "937936c3b973c05e563104df57e50ccc7db9306572b312a1543b08f70bb2990e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 214352, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "28eed321bdeb25efbe8707e49c858423721c62594c8d847e8f3e96d1058995d7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "9027f323e8f56838d575b3d0c2a9872a971bcc19f7fe3803e69f5130c7dfaf57"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "d1a2f889fca6c9ee33450ebb1550d4ef31983e3b3a7d4482eff468465fec5de8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164192, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "73ecd562956dd7f380667122f566fc375e0bc66fa8f913463649df6a06e77347"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "8b78015884c94ffe2725d2e1aa0fad98dbaea0e4bba8a4cabe5a233bb78298e2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 179552, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "e148e12814ef3f3276c69ad5a049630939d108640df4e7168c0efce751318d09"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "2bf1a5ff47be406dac95902db3496d5560e822d15e3cf6aed86a3bc7150ce18f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 181584, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "5bc1af2103a9a8b1597acb5badb4a39a57be4c864372e63757723f7adb3c21fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 181408, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "67afa504e47c80571ab2fadfbbdc513c655fbcec2f63ed97c36563d0bfbd5e56"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156512, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "91281fab60ac75457b3c94dc053f0c005d2e7f8ade446bfe2cc5e0cfde7a9f4b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "06432038aea4151ebbe34d2f0145c144d0ec2d56038ebd8af544f36ca688a8cc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "89622b0b4c68bf27f7a5badceb379514763c6db72d8d6dba65620bd498a94b1f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 
128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 214368, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "f63fd7529c4624767bcdb97eae3f371921321055f9916bc031e8ecd56868ec3b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "c145bc100095a333c1d849d8c0ec66cf68474f8aec4f4e247149373f381f3804"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "4fa66e0da89ea84297728a3eb4bd804671f32227cce3251f54e0e8bdb23a18cc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 165312, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "cdc15083cca63882f48d7e809f53639d0e9b31c2522d7bf4960c0a65abae0647"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "279bd47714f313c6bbc98b756413c52c4a3d25ae8a985147349145d1939878a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 180928, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "71056a1498087734696e1859c49de138a3767f6a816bc1d5d2bb97fa33990eea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 176656, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "419daa43603388085ef80c86b269bda6e7de8fb7dcc182d2d1b6392f5acb1145"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 181600, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "a96114bb2ff8daeb8b255316b082f59d06f44935be53f307c18f7c20c1dcbfc8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 181424, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "f6d3df3b1471b87e3aca010f3fc07a3c8d1ef20779ab3cd10747e5026b4032c1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 157504, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, 
"01583e9465bc17fe972e5882ab80f81744eb3a91afe21ac4ed2b2b00fdabf494"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "c0967ae995e35f968da82d4bb9ba4f42a5d85a5a715c3832b2d5948906d5e81b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "c430e9336d8e6072e4f2b10a41332dca4da6254c17031a2197bd0cbedd26b797"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "375c3dcfc513e4e926f9a7a67aaee56cd9ff099e96312cbbfe2cd1daeb17a149"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "38a5aaea16eb39354a1b1db5d51be3b16aeb4e410987b2c29989d4dbc345e1aa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "6459298cd9282697c3534d7c1b8fad1e35e25f1ab77138422c358d9e1972dc79"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "0d43466e6ec1c4d3b2789d41f023e637ee2ea78a3d65ae2e79da08d0cc92d29f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "fc71c3b7408a806d8e34e247b5d0fb9d3620de9da44b2fd1aeb0d2dde594b410"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "b780e083ab2f6b9158169463e28d4735acf65143cecf1cb65d70ca8fca724f58"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "bf6473733ff2e04811e215a365094d769e74dd0583656c40ce7289d45821e094"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "054dc9f6c163467a9d6d9ce045ae1daa90e5dcc6f4195b85ef3d40550c0d5f38"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "cd6418921845419ffd5bb76b748134a84d8e45edcd350899736ae713eabf6747"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 
64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "33df4a3b7494dbea1488fa83095a33a35908d5cffbd6fa6d624140ef09d929e9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "b8a26368b1db20afef05976ed18bc448d9f7281383caed14e17469555fecdf47"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "ea473dd64dcf6003eeec4961f28a510333263fb9a7fc9a425129615bc9749c8c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "a7140a1770ed6ada97e5c6743bbb0090d7b75a84ce89b4106fd1646766fbb012"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 197960, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "c98af238acf109b6e2670691d9f9251217f356722af333a86e1148674c9d1ed2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 152912, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "dcc1c3ffa3f163ff27dae5f94c244c34e32cf8508ce55cd130af4a804f7a5690"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "591aef2b1cf00b4b08956824d819fffcb7c1d9167986ee155cd1342752b17a13"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157024, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "853c175a9fddf960169fcec4aa6d9281217633ad811eac9cb8e469a24378e8e9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "5ed3542801823a6042665658ca1d93a54a945f132c14eae73f635164efab7d97"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 199336, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "8ac52def1064d92b4e31ec09f99b5c2b2610e9127323fac4ebdf629c3d08cb5c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 152928, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "9dbebf6ff80ea1f2c81b64f79a07ff72a5ffc0d86268f39b56925317f17fe62a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "083af95db83ed2cd36655f3b9e3728c10bc99cc5eaaf0228e29238023186965d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "7fc1b506680428b66cd51d2908141501614b17c9e508e26cb02a38fa88c9c1c4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "ecd7b83bd2d8debb164f00a2c575a6558c60a4b1b6d3face1d0bc6548459b0c6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "e10ad089a76c9e039f8b3fd5e6f8257a4f35a769c3fede1940de38883b60dfb6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 152896, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "90953a85f3b780d7a65497c591b5bd07aa3c2f67936e81bced6bbcc310784488"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "2cc1bec441360e647b5969bf9cfd009064b606944b6d144e6c9518ba7c5d970e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, 
true, "36ded7f28f198c60620804ca809f70a5496931affc11d8244597ce46a7b8dea5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "cf916a81f284bdeaae7ac6e5742e15acef950b6e75b48b1e33fdad26ec498518"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 164512, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "6c3fd92ac3ed16f9b4fd9e39a276b0642ec8ead1e9d859a22e8a9087ab577c70"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 152912, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "9b6e6e5380ddb7cfb3a0fc8f9b1d7e7c41c619541fd437a0ff9f39f48935d7c4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "6927458725abd75e86f80ac97e6c9e27c7095e9483867756ae391fc1c76a203d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "1c978b8fe44be1b13a22296e806b6ceb48cb925a8212c9da4a6d1bcd923a8c9a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "caff9e9b67e0f999133fc209354b75602263034cc01d96944fa0f537bb65f8ac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "10487cd0a44d73f6469d41f7a0fcbb05932c2326f4303671b337f57a185b859b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "4d77844839bebd06bd7ed3fe9194e01bf72d45c3541ccb58895f675a98d9f1a2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 157168, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "7a6f912238ff79363e1f8785de23b6442937e2ca9752ba23d09dc0b994c71032"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "017b1fac78833ea02d92e746ae2f7bc38ad069d41dde6f3334e14318aee89c01"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 165360, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "eef396face0e3a4e454f07906059b9952aedc0145e11256f36adaacfee008014"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 
256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "109117b0cdd1404c890f7e634beaac76e1e98b12cd23c76bc3cc3220334a769c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 157168, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "46315d514335ac38c130c9496b368fe597798f458aea87c7ff6336c48af32002"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 152880, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "7a19227cd810c02a8af0de51aac66da395d244464236f10c02dcb408db077084"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153072, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "87d726c7e230293317451bbc9b42d028acefa9219f91c6eeffabde14002fa294"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "eab1104f7e52842fb948f91e953f987843a2da602a6628b42e5557a1cfc942d6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "63d7ef5e628e8cf73e289834de865a7fe9d5239074f36b5aad6188336851b6ea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 165376, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "ad5a78603afb2f595600ce7e168095880c6c6de1f9647564ecbf15ec86f7ac39"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, 
"073a3e55eb5e51a1d97fc2ddd8017377e97a0d0f2b9b43fba5a8e9caa3e49903"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "fce1ed3dfd9c2619b5469d88b473c33ec9384761aa9b20e7e5920ef23224c206"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 158288, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "bae359839859aafc08a714a212ddee184d21546bfd489ede4707c311cf832a55"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "0e2421d6592151575883ace10139d0fc98108c8c55369626104aafb8632a7eaa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 166736, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "e85a1f513187e31b635ce79088727796d47ff886dd99e0c4ac386e7f76d7ed25"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 164512, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "0e98968ac5324b8db8e83a94695494e85bd797215ba1973c3623642aa4589c16"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 157184, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "06e9f4485e8fbe6b44552c057c28fd8e9882bbe33992184a55b032593e0755ec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 152896, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "0fa297ea4c80a8a5f0e0f2face023a0ed24919f14ef858560802b9c81b930fac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154064, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "625eba166b0a0048123b06b093720b751aaa38109189de051a3c3a3533b75f38"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "6906dba76cfab4e67935904d7353a799a5249ea4d3acc5cd2dbf013cc8c18f8a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "1259ec40201c79c0f0f780372427e49f2c2b06d2563c6bf3db2731d991e97437"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157024, 512, 2, 32, 3, 3, 0, 
3, true, false, false, false, false, true, "22bb8420ff807a6047eb7ba127b686a1bb925c837b8b7f1d02ff0482c4d8bcdf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "d28e7dd57aa27239f7280bd20b35073323d537f8c9829409aa9630042ea950c1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "0e58a06834eb9a1b76a79d94c41c9731e429ef9b454c6982971faafdd8875482"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "1ed516fdce109a1f804809cb515be0e97b6cc62c5a4debe9ce972ddb82125dc6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "b71f0b22d3f4d4ab873297983de290c94dfbc5b53bf0c8861b73f6376b5fd333"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 165376, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "9067d975f9ea881a6538382b6b624efb28a6c1c3a2ed5cee6b6b24ef981fe974"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "1911a2c156f66cee407cd86b91dc09efeb933b53badf9e22b5af0edaaf2112ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "24e0749f2ebb5d1056b32765c51c31b723753b1e69a0ebc461d716b426a98702"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "679d76b36b36a2d3e82ee38bde9cea7da11e2c21475911f5b13bd00451ad8f71"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "c33e58bf9943db670cdf6378072dd65e750a6c6b63e0ba6f5d63cdaec887915d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "b60db7fe181d66a4d730589433ebb528210ccf5836dca15065e8deee0b50b295"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, 
"81f3c9d48b6d8fea95e793f033126776f68fdc50e0e9ea7cb92546ad5ba6a5a3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "21740154c8f3de200a6a8a472234a4229c06f009fdbbfc7b6f982037957d908e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 197960, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "fe445c689060cb6e67e7805f1a791dc309a2a988fe69db8785729c67e4906269"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 152912, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "e89be444ca4cd311bd5c9cdf0a31716990b373881918ce32041e047112d4f469"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "16e407d80770e056a0ad77187506cf9556b9d6c4a3cc2e4055849c85ad6eecd0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157024, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "2d4626165dbb0ce2967cc74c5b673f00c8b36fc20fa9515be37128e0c8007f6a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "fc9c7ac3d64cb80fa66a8a360f72b19fe465af8f86355ca6b5ff52c8b727a8fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 199336, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "521d7ff4c26bf53602b061b1ebdcdaf00e2ce4b4883b3eb5ce5bb262c45444bb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 152928, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "9a40e1e4caefc01fc0d511681ca1e0e584d701a8ebc1ff4012ce29729b58d167"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "70d1dd1f1c0d1e4af22b280f1e901d1dbf0afe888b30db8d32eba6d03fc22c11"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "a7c9a63eea6a95643fa8471dc8052b3f3f2f2c124dfb65e6b5478c65457320fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "56fe67ef620190a3d8bd13f4976226f8d90176a680303d667131daa57174ad6a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "6221a0b42bd1b4b8379f2c3f1efe5e05d98d69e299268cd1e1970209fb4ac6de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 152896, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "8f5a810e751a5785d2bc7bf75481fb6de0c74832f39f34a253a9e8afc167453d"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "91a68b34d96e8d632dc2f227dd42cbe82552393a456aa23f052a08919b6ba3cd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "da9871aa11224e9f8dc0011460fae70cb463d1b8840b16d66f01fc85364bbb8e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "4c27da3038ffb8eb6e7983147e58843c4c65a25d5cb85df0ec97e6e749941ce2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 128, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 164512, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "9f90a1a7dcbde9565e8ce0e4c193416b8081ba05aadb3c6080eeba0082fb177f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 152912, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "250af3d87585f096d79e55f17202d9641852b5093cd7c11cebe86473c8c2125b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "250f5d3d891ad36361a41fe522a2e70bf322bd294d018832de4d37c3bbbd5375"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "e405ff749264551ad48de1cdda2117f7cd8fb11a2683e2fd44b63a1180b13582"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "8953ccbf45c47a89d1e1a71bfd57c8b64f2be2683ae4956ce085e6df6ba7fd0d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "8ae031a7006e5dbdcb6dfcf5ddced26a0978ff00d68b9caf7606dbea18670b68"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, 
"e9e66354aa21c69d6c1e967bf548b8a3b9064652ccd56a63757371383d24db53"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 157168, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "f58303cb4c9d6040dc2ebf33754e80536e1624dd66ff7bf0eba8d0d7d716b695"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "e2b297e8e857af59a359e289b0113a85f6c1586eb7449fd0a295c48f066d040d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 165360, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "66825bfa28a4d59c9098bdeeb1c9ecbe23e4430cab086ad8cafd69de441085d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "780198044021c187609ed5bb5a6b17b72e717296172896eaa079b2e003cb9f24"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 157168, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "cf2464c6ef628820907e06ed2da58e88243cee0a7abb0f2ea00af1b4619052a1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 152880, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "9796c738534b11150625253e4ff289a6aab113e7abdafc4025e78763828c53f8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153072, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, 
"80395ea5e0318eccbb09a30988abe907469c34903ad4b3f8437ab52d7faddae8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "d22ebb11205aef0ffd9e7154bb884ca478133413c3875eeba5f6bd3f576b3fd9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "95d8bb6211b61715e9593b134d694d1c2187e360f8fc00a4b86155444ca82767"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 165376, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "3b75120fe7f307c831d970cb921e3d79556a44f895589147023baf6cb7db293c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "556e2cfba71e52fb38668a769e660d9948958ec15ff9f5592f611e14316797f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "8d48af93408ac3da631f308ed5315f0b40cc18e02a53ded5dd4cb7c49f9ced1d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 158288, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "765efbd9dcd439726f5be6cd21868d987a21c0c6f1c41c62df3a5deee470ca7c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 
384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "ca65485eac75ab9de046cc0a150bdbb5b25b05153a25d4c146f0d98da4b813c5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 166736, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "9c34ec39448a490405018ea3ef26dcadc5db0bca84866ea7add083d78903f1a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 164512, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "89963f6507dbe3c102e08d86d6b49aa1c83e5b617259c51a26d30467a0437d56"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 157184, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "de3f6445c1763141fe4989848e58c48c655bcfe92a7778c449a7696a1b5b2817"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 152896, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "a6e131cdcdbd3dc5a90df9a6812d0b0881e5b8400687d30df87820f1474204dc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154064, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "e7d6f8c199977a85f238b5ddfe27108eceaf980cc1e509db99917228becf5c83"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "515d462cbe1009b2eaa60fc0e4efcc64c9ea842bc1b1c1e7ad56aa8a8bbba546"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "7143a20297303e56f8900fc572b447d44f2edb4575931527d2cd7297b5284747"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "fbf7cf7771a8173da63dbd179f0690e79805c0046e9d3165fb61f38d5be52ff6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "4b3af5578614ba34449d0842a2b4738f9d6706e5a94eaf88f5ee83b09bfa8615"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "06749d021a1ae489ee1e7d9d41055ddbc2dbe74b1e6d1eeb62cf9446929566b7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "bd74a1253d008150ec303d926771dc758392c57bbb69d4e00ec7a8f811a3d073"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "7414e15df1e4b2cee3e5c9ae4b5642d3d2b9867d68bfa669d21ccd06c52e7d70"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "20a2a6b51cbd0df6b8226614ab4bd410c9920129d87539ee786fef86eb276689"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "90c8c6c810e10af13f6fa27e362e5519f61112c43abe277519b40f4ceaa2016e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "5946240bc25c1476fc95019e74c82c4af7f84a3408e09815c3ab154dc20d6377"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "817f3551abfa1cd206fb8521f7942f25751f2d225e6478ed80ac5718266187ca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "0dd61dbb0e245d9b451620d7ac9fcc56b7c259dad51a557ba8a5a390a517aef6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "9429fc95901de1ded986cbc6456750a1cad76408943042a5d6300ca039b339f7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "33f8418798084ec920ff42e046c1d7382217fd00b18cfc39c3396327743124a2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "2c14012718d0915dd462b496ef7d8fd7f7f4fae84380a2563aef892ecc59a4d1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 200888, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "0b1c98ff159ab6ceadd0ecb18e88fbc8f108fd631f3baffe4d792039e1c0d6e1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "f073aa547fce41a8afe7726b19f2f8ee498db61a386878c942275d6410356fd7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "f5ccb97a80fda0372e69c4aedf212e682a587953901118cd713dc655db422955"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165072, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "bfe21100ff414d51d8eca8efa9191b4ee8905192de95d124cf617af2518d45e9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "18141f880d3fd453b962c4723785466dfe310bbde76459d855f92a9466548e5e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 202264, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "300416f00ef9bd93e40bdd15ac161f034afb9957784fdca9390760a9f264a0de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156880, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "fe6c58f07d908c34280fb642b94deb90b00042630d05a0267d153f67ad85d70f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "7a5204b0376bf70ecec725d713cafc4eec0fb9067cccb74a5d3957525a86417b"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "c55f230988c22771d32df09476a9acc4a317c91c8b243c21cffec83aecc93817"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "39dd65fcef6b921cb9619dd1326ef628a95e8ee8fabd5ed1b528f62c48b57d05"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "87e4b798ae3d5dbd13745d544144bbc6b8cd8bb4c7f8727155402efa4ac77d50"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "79f52dbf4ffe51cbc7236fa3d6c8350aafdedcef0e7a3c77805d10264cce0c1e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "fd9489baf4a859e427227cd89d1d2dd7aa7ff39dbb3115907ba5e130469d9a31"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "00860f4c77705201d6101a442f6f3b25ed5b20c3c07845def19811dba17f43de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "914649ce286f0f5d78a91705280e021e141e9db5288c0c631a18d43ca63855c8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 128, 128, 
128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 168464, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "6a1e2bbeee477fa171e299d6b1885aa566c73c16a2e893a615eda0343ee92bfe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "c593dc0d51994ccfea9becd0cf87f405a7cf09a68f7a67023b90a85bc5ecb81d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "285f0e2fcf6428e0d0dcb6e675ece517b000faf416b7d9ee33c6372461b83f6a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "cec5b07f192f0bbed1b5ddda684140833d0ca2d2426f17839e6786a8a739fe07"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "841ea71e8e54abb860cd0f468dcc713906960a762eba2c3384426c3d3dbd3619"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "ebd9405b613da127f03e121128a8c5067535e19b6db41da259f6748d3ea9d3f5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "2d368d067bff9d5bde70284474dfd50e7dc88b1c7aa2b8c333eb074b97c0a660"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162144, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "468993ab52ca220de342af159bef1b9af4f61276aaab0887ea421c4ba089aa3f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "28650b930b639c22cadaca4499ca1d43eb03b9028bfff85b1ed733b7830a1768"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175456, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "765d7a4405414ad631f65ad9c3fd387fcfe90740f68c12e77f4e661031be2708"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "4e38a5696d39b504f37f05705371d6291b55c8cd70547e72d3cc6bb71cc3a895"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "5d5734c8b84d1a7f0da7b8dc6430a816307f8b7fbd3f4c3796af80d9f35a6f6b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "cfc3edb1eaaedc7f6444788d053feb9da167754aeaf0b649eb5848f2d2f439a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155488, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "04701b3e93d6fdca10ae1f481a95b13a2d39e8a5e9a9742f73447b7802ce32f0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "addd789b74d55b42fab2f94c4d627d830b725ae811bf3e06bb2263a3398df928"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 
128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "988705d22f25914b11550accf730543d5f03adccb0b81089ac62fd2cd5fe5069"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181616, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "cd5f6cd8b2527a18914001c102438a1c92b21b0832eeb32726648d4e0f2ad8b0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "8ebdc421a4180bb14d8863533734f88f2d418468de3b7dce3f9208ae09a13dd5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 1, 3, 
0, 0, true, true, false, false, false, true, "7dfc12601d143460610bc6556f504468e0385e12ef035f1eef04e1509e21d3c7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163264, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "a8503522bd97f71d8ae50c95764d83ff8e41e855a47d7d032569eb68ed2855a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "582333d72ca02ed9447a3907cb6736f344b41f22045b3bfc5b2eea8be4de1326"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 176832, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "e77819047dafa953233a7b1a7cc2cd0414d09fcf8d3016ab911a4c74e8d5beea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 168464, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "06d614f9eb346d5e94db7f663323ad9705e2b7a53912d2fc773ed87c628d7a98"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 165232, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "26826746fd8e6193eee9cde84b2be955d54accaea027481b9d8dea75ec18b7f9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "a576d0680ab2c5e9a885a5e8678b0042eb172e80d85785165c1441d24c9f5b08"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156480, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "0b9c26f0ff8ec22cd1f20eb8627d352c48957f405545577cac656adb9beb7cf9"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "b6fe175d9b197ac98d7a68a62b480ad0d5b83410ad493b9e4a999f71bfa12bbc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "33f8ff9b6832621cf44211918fa454bb57c6de304b3e00876c7d6f68ecccb9c0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165072, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "f0092e9585d2208194ed0d06fac237c0ec893764d46edcb7c1d158034922ee19"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "532ec927a15c0f3301bbb0090624390eea782bef886296b569808c06216da97e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "d9c8da0e95f0508447b7f1bed8e04735f8f7a23dca57b3807874a2d87a77361f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "7acd0f9ed9da91a90cc70f9b55b3680838da9a37e3c0fff75f1b1b32b3c7420b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "5cc58821fb5149306af349f8ff7499e2ff55116dbc278b797d73af96baf141cd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181616, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "4ff5abf731140bdc721cff11bf094207048671e01925d0711145cf17d2cb40ee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "a5411eb23c84497a2d9f16d3093f33bdeadb9d4058e487180e118223caae0b6d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "91f166da42f718250b7157e020db836ca3e9da238a477adbed53a94a8a820e2c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "c317a84967cca610ab2a36f86bca0fed7515ea74b21e6d268ec72f736752bf6e"}, 
+{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "0ad402a88970e6b13f7a5dbd71ea925a84834a0164cf12b7ec19882d09f8ec81"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "3f62b033a3529036f33a09c3fd4def02847dd7911e79f78ae9f6cf2b4bccb16a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "fe7f16776ee51d14a776bb0b8f1c8281b614628a12017a7d874c7e777d591dcc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "4d295f70d18e0b3fe6aa8e7217778e99f81f3733e3fd9e4f8d466865dc6d748e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 200888, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "5b89bf26e4779c999477d10c2390d53fe70f254b66794cce5c5a4212504a46ef"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "05f5711545b0fa78b282e2a38f7505430dd0a640b338ce1e5fd3e3dc2a2a6f2f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, 
"be6aaa3ff720a9c149a549a9db6e92a8df07e9e2101c36403b80d3fb5f189a91"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165072, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "f5c3ae6190e7f98270a369b521e9ca851ff2cf7ab39dbd919f226d688ea44aeb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "724226a7e903e3ca77049add145fd3c21932d48595324c7b83c4c9a5992e3402"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 202264, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "100f1587b9819e8796acbf2504137cb872394c382b4d4c251efeb98a7bfb1cf4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156880, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "eb4aa8b10ac2c5cebdd0d416210e3b53289f2c68576a482d4be42157a88460f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "c31c87cd00c94c28f969ff6fc8331dcaf3838c8e999a9dc9df370b77b742e53d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "df807400dd9c7113bb0fa6f9e519b43cbd51c7a34a79ddeb4ac75a65d0b54265"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "ba1748fca055f317959e14d33932ca8a32cac5e37069e3a27ed1a3e6bfd398af"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "f90124b8c472f622ed3dad718695e4c7935c4d298242842ca34761d4e234dd3d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "e0566a08450480344d773326c0e9d4c78d5d4997eda06ee0c091dc9b475edad1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "c3a227a16d7dccefd6ca3c2fec7d5c413c7e92b236dbaaf0ceb996b25c4d255b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "b5a2c03ef553dfdbabbf172c0ad6163e4a4b821c1f77658db6242892039fd20c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "830761684dc5fb62ba81687a777b4bc8055efbef8dbdcfa250c332697383735a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 168464, 384, 2, 
32, 2, 2, 0, 1, true, true, false, false, false, true, "5d9d9efb7de7122e5845b9f342678231c225153d5f8eda089d25cc13124032c9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "c43046319ba30d0be8b11a14054c93bd8886e8967a14243807167b8063f05bee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "e10c051c25d416e1d10b835f99868995174caecec1f1674523c1bd552acf9a86"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "bfd50b7d10ef81000b6db2d0a4aaef829887dd7ffe0616b58e68307e30bbd127"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "6d1169406ccce3bee2349ce8c374b968d9cc0538eb574b46fc15113129999488"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "af7625d832340900c22d77fc1d5e30e03ebe8834a6d6c516a46bf30cdf1ff038"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "79477ad5e04cf3f0c295d79e0cc06927f02f0b5dcc3aefb2df0a3fc35fab2cd6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162144, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "cc7be1025911479e396874be503c0e7394e9adffd512d17b12dca6f270c0ede0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "e0f22ee5cd388ed0075ab939936c122e90a23f5f37254009066ef15d05d2feac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175456, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "63dcbf0e781c83e064ea2a6b8a8b17973a1b6aaf7cbb411d2d49ffabebc18d97"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "3bb64530703dcf990ac47fcef437fc754630e0138bb66efc9d8cd46d778a0a59"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 
128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "3e0d009024d9a24935e36242069e62c97e97797b2a3d8d89a4d2b5c4f167fe37"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "b547d0c2740b1b29f5e8f85a4bbc15daf7d0fc27a24e8598081240eede63791b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155488, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "5e0e01cbba1cd9f8091d4c7b9c2f1d1b3023592f20aa02752991db8d2c8a0598"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "6c8924cca4384d97147c9d8e963f7965865d4c9584f2aebd3590ea1162e7ab87"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "2d9f1949fe05b174632243373d99f09d16a162070441937d7b21da3e94268433"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181616, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "31e73f307b27a79ac00c30b5dfb33d20d2f40e8b1f1e438164d800b6be59e2e4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, 
"dc357dce5efec13e3beaeefcc701200f75c5001bf3d4273bd10349a9ec4a1c62"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "9f4da4822cb909e199779aadeebe4b0589d8990ba87b391164fd3a4c7580dc85"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163264, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "b85eed13129057c75133e06daaf6f64f0e310ce3f615c0b39af1cb3147de0848"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "2a710023d9bce6d82e9dd9d8c2dda1eae9b6e1cafab868b6c3a459b88580569e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 176832, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "80b5439ffc3a32e006dc56e52630b73416bc7c842ff816c109e4c1e2d21f8569"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 168464, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "7c0a8e3d5fc5d1eb13aa84ee4067931a5b9520947e8be63702a7852f8a758c38"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 165232, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "a91e6741240c67fa924c7c8933ec9148e4783e34d929f3ec08b95a4b1971287f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "2d9f590747856fa051f8a29801ec8662e4ef8f570785928bbe25508bf7c0538f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156480, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "7f7cedcdf287f1ef21fb599a2b8875e6c72ce561df5343adbe301b892d8cacd6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "650dc02810c56406f529aeccf0bdc903df52daeb8d3576aa00c59c87b9e84a5a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, 
"222c92d78ded3cf5a15cc7a40de747124fac61dcc954be182ca7e5b25c0b9192"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "7ba606e5a2adf64b83f00193d147f3a80a63bcef835c18469c124a9fa746654e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "03484925d4e8c26ba980d1cc79b43dba32608bd0666c014aa9c3409d03ca62da"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "7b4031b2cea2e93e33e9f03081413ea097131e8d3eac8a364ef60efdfabe7f38"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 0, 
0, 1, 0, false, false, false, false, false, false, "2e5a4758fc49140c95d606fd80db96978713abffd162c206f2e3d15327455c58"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "0f0108ecfeef6579ca33a5ffbab37a0311ee3317f7a0fe8e8be32be47195989b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "f2238ff9e515b0a3a40bbf008bb14764b647381097d32b6bc6d9be4af86a73de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "cf1b8840e11ce39d1deb4fe160c6a99fb7301f5b73c1d9b13fd9dabb1f6e9a7e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "7067d614320ed174e7f44272dfe00ea22515b2d587efb93dc63bc883df51584c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "9a2810075ebe0b7a11a3b9b974252b8a35c1b408a42f00cb04e48d5ae3837f20"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "27e4fe4ea33c48d9dc515e47ce8213b1e9497aa8c2d6f9b47b452d8ab62db9d2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "7236dd90ffb99b1edbb043db8734bbf5f00186d0ca98bf4ab6dd6ee324f3e516"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 
128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 214208, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "b1fbf7ef9bb83a83d026b957015995b31e674f76175f65fabf94a696e2d8935a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "66897282b57a57a714b35a42948eadc824490ce23cd0955d21c760a0d6009ded"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 208568, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "24cab40d96530d94e6ed32ce30eae72a243cabd1bbdb03342544cebde83b55ef"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 181440, 
384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "aa59fe3b7e157b7de1bfa275f910dd82c722f2d9820973071db3a7ffb7650630"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "1e3cb667d24d005ccc7cf64b84213a403dc629026ea687c1813b9b86e065b7bf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 214224, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "273be3705415caa170bbce54818a37bc57f515fd0cd9c41130fe708d119f8398"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 196376, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "593dd6bb27184eb69f109f01ce3af1203a0cf76a8a8a2d724de68163cd3d1a6b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 209944, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "4d29e6ba61325b05b983b79df6953f0afa485ff70c194b20ca6be23da5a6dac4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 181456, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "b48789409ebfc74d41563b24457cc38e6fbcfc6a8219334674611f037d038bbd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 189592, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "09880368c8f9974c91438f1f864a9655fd6c2616bafdee09db9151e74a5881b1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "e3a37f91ef46c582312bc77cb7ca6729b8f25aaadd81366d60164f7ff1a72a00"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "3fa6535d81d030597b4961d08dc5d8645284e66361b1bb9d94a2347e7e40cb5e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "b6f5fa3a8cf359aa6b9f505b86d9e41d6aa4fc59389db3722a67425e8bf9b73a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 181424, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "be1101faf84fa0cf46c600d215ef771514657b82287b408c54e5cda0a766ebc4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "47b211eff162a26f4166816d8ec1d3e83c08a0a92f9d5469070a05ff7df3dd70"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 214208, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "1b14369c7e539bbdd04ff9ba03acdf27fc9bbb7f918e84e0a7ebaa7b710a3045"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "babfe7ac73e2f6bfe2fe1f3d127f1b3dbe2280d84ba5a8676f4da7fd63737c5f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 176656, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "f70d583da777c09a4dd4da56b75937892ba40c7bdde3141b39ce092d22c8ea47"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 181440, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "85c40d6f2eef346fa2a90e6f7ef54693ef48aa8ff8efb56e906ac66ace1951f3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "cb3fe7fac586243c42e6f4f661dd0a88d3aa2df59570209fcad21f1a722c9317"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "4d75d6de39c70844ffa74403ba30f3fce39d8a4ae46ada09010beaf15822d345"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 214352, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "239c339539ca8848da5f7736ba83e8285c262d8a5406fabc118f7e695e4b3a9c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "cab47e5366ecd07b1665e8c8e1bdbb57f8793a77858890ecd4c225296366646a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "e55909bda9cf2848b596bf63498da0243b0a06c297640b1d91c03510624df835"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166240, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "dcfcf3d7d97b37938caeb111baeccc6f6a638e29388915a647c170dc47ad9017"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 
16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "6aa4c85f224a8c50a739e5200789f6b5ec1ef8345c0e5ce284e402f1a07da958"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 183648, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "47ad9d4f2deade70b37acb1832cee1ab8f4f3e06c3dde19deae05f60c4fc72e3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "740b69eab1797631ec7be34cbc4f4175320e228fa41cb47f9cf1c1c3a30ccb4c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 181584, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "e8a1454858487db5593f1b1d45a931ab4dd3a7cc747f825369b6f345e999a8cf"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 181408, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "3c60a705cbea9805aba8dbb7bd770a4ec1c7c372b592ec08802a147f3c243e59"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157536, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "07fbf27932e527dfa930d57cf98f3eb261e49496056122e498fa71414c991912"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "50a711f5817c6cfdfdaaa5ffa0b7a74257b551b8c4fef9a542d7d83ea328b40a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, 
"bd7cf57ab5bd467e91a7fecd195faaaefd1d802ac45dd2a2b25bd87d253a4565"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 214368, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "c5c1880d5a46eb35dbea87081befc56c28c31d69f9111a677678e0977083ccf5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "efff1b1390131450f5084590431e3ac28b47b166f8dc48785d7998d368477c4a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "eb323f075bd999b7cd0eb8e52c902f4e2985e0b7f950dbc7471de4e1e23e5070"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 167360, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "8afd9d03480edeb579f05a964fa191eaa4ea244f5a6ecce547e8f3f58094efbc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "5c0598d50869224358f50bf85f3fe11644e5f7362574520ad9b750f41b57b3f5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 185024, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "4f0dd37964f8f75172da9a18fa6136788d591875b8c453a0567e1a0eb6a117d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 176656, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "92a01c9b20f04289d7d42ddec8d74a242379562016e1fd632232b8164328b534"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 181600, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "2de2db4bcf93a36e7b60c95e747b2dd886ee0f63dfb8a6debc12a749079674ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 181424, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "fc11ed8d472939c51b5c21bafb6651287499ce8d4e42c36ccbf6301c65c05f9c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 158528, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "d1e23a09ffcefd5cb2eaee77e2e54d5a48a760b0aff406ebbd23993198b425ee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "b68646a38ad6608f42b4179ed11fc5895e4fb3c57074e0925faf9bf545b145ce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "43e5f0b941fd0739c42a5a784862a75b86ec53f208afaa806efe6f3b6686755a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "3d70854ac259278dab85b4cb9f1fc3d91d8350c63f2a0a42b41795363e47e227"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "f3510f51c9d1f14716211d357e39d6505896a559d3e8fa41d906d15d00304b5b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "a01afab214ab5cff31463bb2ef771276c347520f728d2d7d643d5b8f56abd94e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 214208, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "6060cc41fa43c0fce34424ddf90abf5bbe3cffa37a8502a68ad6e76604018b11"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "cbd32caffbb5b61fce76b4a5127f165fb9a404e499035e3d2686d25c57c21f8c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 208568, 512, 2, 
32, 2, 2, 0, 3, true, true, false, false, false, false, "a1df05f6e1d5777f05eec04a47e554bd91a1d8a12996eb94ae738f0bb8a49bea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 181440, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "42a582dd5ae953c8a11e74bda1a70ace4be37cb99f406399e049064d113f3479"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "709cae23c00ec47c65f7dcb71c31430a0805e7c5438cb8558a3da7137a1692a9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 214224, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "c7ac9b89e2788bde05fcabdead23f5bcb2725d41f9ec2014842d6c96a43a2bd9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 
128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 196376, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "07b683116995339ad3720b417f691ebe46e682d395f505b6776de94ea9fd8e13"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 209944, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "2a674d761a69d917054a4c0cb016dc7d41361c0380345c46a6f316bb04b787f9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 181456, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "d540be0e6b1739f99898848927e0413e3fd60f3bfaf633f53c35d908a40bb64b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 189592, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "45495bb2b29a9e3260e3861f464d0460b8ed7eb69da8043c4d8751416516822c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "6514365d9841cfc599d2171f8fed6a266f410c833e5623bab23bdc64bf266cc5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "b5a769849d5735dad8e5afbf206ac828465c4ead5c5c2820c4d7fb4766dbfcfe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "f8c607f02e48a433d192866b8ecf6d14cf60900881df6a40b2423a06b20161a8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 181424, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "fd79cf7926fe5b6452415ded85c2cd566ac34f86b57c735659a4694d0c129184"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "654fb48a089d0398f4e3a0f42b0d148513a58a8fed5ab4ce5cf55e3b9a0abe00"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 214208, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "6ba90f66ec7cec4f76bd4a3d2805270b6dd347f240f43f2b997d4eaefd0367dd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "f8161d2cecc30aaab9e6db383bce0247b7625dfa93b23f3557ac72d569629d73"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 176656, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "234d44b0931a332f58423ceb1a7722fdbb86dd83aa29d1bcf0d986115a0c5d14"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 181440, 
384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "6cef937e6b153d90c3bd8e9d79e20de5f7c6a0364d0109d71d4d67c7905a365f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "264fcd18e6790722bbce16badc2b725e07539826021a568a7d57d1e021e1e4f5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "710acaa2797dbb083e282f474661a725d7f2e08c5ba8ed2af287d5aa50af851b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 214352, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "ca613813368cd397cc4951ee0ef77aeb55a58a46b66032770f456e81543d319c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "b0ee99fe5a0a85d868d13083d46ac0466c015ce0f6fd4e935257e062d7711ce5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "59a4f82fd21a8e3ea87ba4eea7800d349416308e6ca9a53dc739d2e61d6f2091"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166240, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "c696b894db5cd1be1845dffb6e18197f74b217e6a750922b96106d804ba39afc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 
32, 2, 2, 0, 0, true, true, false, false, false, false, "b3631a19f6593021bd0d83f8781822ca7e04f116640b63a12a806d27480e7a6e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 183648, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "5609bb7c547d5cf621068fbd6703915ceaf5eacb78d533f848e18e7838126fe1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "57f66079127fd9cc021798a0595fc53fdf79a31b77adcd824071d1b0147bcf6a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 181584, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "ce60385395bebc57b9fa939a0338f0a3b21f9f6ff5155acae206edc2c2f92299"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 181408, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "7e997ea29e7b3820d610983ef4a460a2e7f160558402c37caffad4ad8e968f48"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157536, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "f0bca4a8453f8a043ab9d0700a71c9bf15542c341022c4ce5a17a74d2dc5b10d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "1da7b505bcce3f794faf0f959b5db3fc19cdddf7acb5e0fa3a98bcfec934eb40"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "daef2d741f7fdc67630d8b2263e5e9bbd2189d2f79e886b0c3526da9ce95378a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 214368, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "ba6098b3ef5d4c745541c108d9822fd31545b42c687626e1b9e496bd4408a881"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "0c81fefdc71da38298cba914a2befb7aff6551382a86d70b8378f06577724dc3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, 
"a407400ca66a2fad68f3f550e04bd56a41d6e0b84712a303caedc4a74dd7738e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 167360, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "f75ac452f9988367ce9561f7a992d15f0d15d38b2d913c88c6909597d6009369"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "22ccc6da64bca8b4368a9ddeb5cb69db28e57d8326083fd6253a21c013e7fba7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 185024, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "6fcb8867867abd56ed5780194e3e1412b4f4a623b0a5a3deb5713b7ba6104e75"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 176656, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "95be21da3a91c0f0c5cec386344ab7a8490252e386c6c5911af9933d9e5b4da5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 181600, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "3f00fbbc836b0e5eb5ca2cab67d1a3d048204b86a40e1cd26cbf67199531c895"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 181424, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "179d8ab6d0a561f452511dc905daf6e84826939b7c446b6041d4586c69490730"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 158528, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "a078d5980c8259f54e254a4bb30b61bc74e74eadb48865c76b1f33cbf7d14714"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "62c7056da687f533db4a14c08bb2b6cfc59af66cb1a091397bcdea01bead8ac5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "d94d76a19295416b90bd229ae74373d37186bf280ad285e44e3e01436c306e79"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "b91d26655ade89b59e90b5620d796f9256fbe1be89a2d82b2643de46441c380b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 
128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "a5bee5886f49116d5ac1bfa325f93b679d4534ec780d52de6d7659faef8edcdc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "4352f76d6e87365852b33ad5c8fbe0f30876aaffcb9ce710e15860a6875581ff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "739616a1d5d37648b458404c77f5a20395ce8089e58872cd3ab10961f4dec990"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "5ca369b02f366b2e3424a9a861a601a882bc7bc6ae32f0d9fc7f600a40a9a40a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "5dafe0827df25eb9c88c39a36741ecbc2ac24221ad9214554313a8e3bcc4c34e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "c0794a513be1e4d8449279d485ca12a446c0f00ba57afcc240f0944676fe0c14"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "8307f475d3e58e85c67bfe86237276aebc1ef20529de7c9761d78f2a102c52b1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, 
false, false, "4b232e294e5599f6cb3b11c8cbba542b4a19b6d74f818411a3066e1c9612201b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "cad428611b47697b168300a0086d7d4828ae7d6aa1fea7a4e58e0e017d62e26a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "064e24e20443d4652755da14a3d0418f82c00ca1927f94a97f76b59e0f48d0ae"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "50a1a0b36105375fea0384e192cbebc7c4d389a2a1fa1b1a8aff75592e2c136a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "033d5b03a0ecd0c010a7441382edc3988142e2cd868f340f6b40dbb00b54184e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 197960, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "90ca80c8f440744beb3f403a47a0d0c42d598bb9720aad47da373d0edf4de681"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 152912, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "dc5abbc101ac36606a05c17a87666b536a100aafcc717b9c323b3d2501c96411"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "7aee9dd155dff71f46588f5d217b2205e9f043313b910ba12bb3644f8f7cb66e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157024, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "b8f98bb0316653675401034e1ae60ee2003adde6233182ea302d865b27a41529"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "14471ba4e09f0e94a0912c499761a3e2d0574d2f10e88908602c3a2a7473d7a3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 199336, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "3f36f049b737f56ce3a9cd777bcc1edc51bee959299d1b4b5a6d29f4f4921f20"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 152928, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "85c192e77c750fdf1d998c0831e52b1f89e1858c870640b3ab4c013e01c8fb63"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "a4be45ceaf0405f436db32593d717c5b3d459fefafe9b7063ce2b1159d3ed3d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "ea3b547257e521c7ed85d8088709c78c3a5f88d0bd6b4acdf9f346a7adcbb4d6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, 
"0bc0609f9a3ac24dc75320db75c8814ece4476930f939bd23a017e756b5b634e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "5e93f6c4dd3b79757aec3d71b26b1807a7862f7402ad296456f1967754c953aa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 152896, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "006e98b5dcbf9c9d42bc683c3163360f70e31d562e0ee8ba4af53f94d28a4805"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "d8e51bbd0cf39a0927cdd937e9a7a94a32cc3c6e660890be03c8d767dace2235"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "0e6eab404c9c8e734d3fc0ec8b6d189b78f6b2a39171eb9f5a77186129daedcb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "8df78e018228339b869a17ed389860b3bc9f64e31f3d2a6381a5fd54e40d4b20"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 164512, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "21367c223becbb9774bd3400281178c98fe4697f24ce8e7a32e1212587f6c280"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 152912, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "f457487f9c9a33bd5767c3dd9571cea523cdf38337e4384f06d4d69f1245e58d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "33e19b202d104d8cc21bdde7092c48bb0888dd56a32c675e68b910b9a15a08c2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "baca76ceac7ac3b73f3420f17f64f174b4f0bd5f2a45d3b5a92318ef04936326"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "3a32832e5ad30eeebebfa64f265b6148003438396c05dfc6b488f4a779374684"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, 
"8cc27bb6f0ca0fe789a0088c839795110ddc5d77db3e907bd30143961a5b577a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "5b6c95938d5887731db0cb0a0f3a7596d1c245ce7fe3348e02ea9709231f0f40"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158192, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "394acfd14ede7dc659dced73774c620f1c4e222d5e9acac4c0d0696ede87d09d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "c426dbb5e958b53cc3110ac392defaaaad607b0d7262da52cab0d406a2b66fab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 167408, 512, 2, 32, 1, 2, 1, 0, true, true, 
false, false, false, false, "1bbadaef2981259043084bffc8ca70df3beb51811f6e567a5b0138b6f01e050f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "f47205c9d041aedb67f4bb27ef1924cce383f0f1ed520c46cdcc2e38279830d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 157168, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "512bc5c87a109fd25e53e131f9a2d810b6d831fce8fd054e91a640501c8c660f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 152880, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "a5bb83c9baa7254747ce6492ddc89543f794a1e20075370101d1a28ab2ec8fee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153584, 512, 2, 32, 1, 2, 1, 
0, true, true, false, false, false, false, "7d06a3eabf3689d261d56d4ee4f5f8094f43c00850bfd9b16a4e646c9e05bc97"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "55cff434dec6e4172f945a6dbf54fc40414ed912591907338d71bda265e1b36a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "d143aea5ecd7a9ef117a6248f19fd674f0ef2d66aaf30640d5cb7a1799edba1f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 165376, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "39079f103114ad5c5b6d97a49aedddd0c219f3256b2c160a6bc144a13be87ae5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "53e3329569041ccac79439533fc171747d059af4d300064a73b20d8fdebfa50e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "711137cfacf6f7f285e4ed304e66b874d235916875ecdaeec2c422ecee48cf97"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159312, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "e9c1130c0eaf4989ae89d776d736351936d349dd0f4840cca05a3a9bdef642b4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "300b54d736369defc08ed09eef4c7c48cf95ae0c7645affe8ad13b62b3233272"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 168784, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "31535ec733b413fd1e45b9f0cf8ec6ac57cd9371b2bc20c4445bd313dea8ac28"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 164512, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "0d4c26fa7e58151c112afe76610ab7b4fddf3c0296e836c0043f03d863161654"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 157184, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "ef5cdb651dd683da0be13c956c35c7ae11c836c627e17b3cecf4f09e124031e2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 152896, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, 
"9ef08033eb7708bc1cedb115e5e563927c09c1b11b09ac3581fc7efbe0e3ffdd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154576, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "322b312336e462f858cef266d695d3a64cfe046733a75c18799509d8917734a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "0a189248d4431de46f843212117b10f2aee52db9b63785120223bfd4da303b3d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "233a7a42a70cec37b5c205098ed140d987c34ce1a1b801aed57fc78aa1210b84"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157024, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "f2fe35ce8ea08c912d375d54e970d57985efeeb83897d338ab652c592f8f24ee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "f463ce72912c6b7974f7f623a91658aba89127d51a72785e4eb46a87d4adf9ae"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "08c745a218adc37a1f92034d1b64476fc5472c5965f2dad770fd40f567025755"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, 
"a8e78c78bab8b66cf5189c5447b40385ad4ff489aca04850aead807c91634ffd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "fcb5471ea839686459872cbe264f1073c2cb9c969ce8a9b4ae6d45a5ac57f054"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 165376, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "27ac117e36a896765e29f7f09c4c60dbda21ab9e52ff678a6c28e6eca2843dcd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "22e95a54b6df912257ba702a10ae40724cb2d4554d737cce1da42eafbe186685"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "16070ec4db6f9d06fedfef29124ab0d807e84da269910bd25902d548636a8dc2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "a39316a8939e6827902a65e40a29977eb156b938242c280fde60978c98fb3637"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "6b9aa2662e385e239e0a836670e085780ecaebd8cca7638d0cde9474e39d045c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "462e72f5d01e43a221138d9bf74108a84f3d5f3926ed6839815d8df1dd277b08"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "67df6c4ab33a848037ca460b5bab044aeaa54d9a1fc2aec3bfb8e593d360e76b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "3589140fd7f48cbfa5b47d52164422d40be3d0ba1b7a79c4282d9cbfa750e8ca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 197960, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "bb47a74d9ca97e402088077c441d3b7a970ce2e02bc3e36f4aca71eea51d1a99"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 152912, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "727ce42b925e50f8a7616ffd94fb811e91e350f392e25662fe8de044815cc2bf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "b9331721537af21e3a9277d3401691825b7441be2e9e33025957b102bef83541"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157024, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "a29f2bfda29117a6f8b01bce80fdf645275c39847b1b7b54dc187324dd98b9b4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 2, 2, 0, 3, true, true, 
false, false, false, true, "1f4b64915089c6c24366ba5ff775e4bd082fcfaf83a25716141a064bf974617c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 199336, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "a206974c79c5ff921a1d8f647c216c5bec14ed6758e4796d576d5fb989121c35"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 152928, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "baa736fa02d12e6755bf3d827fc08b08d2e43bf0d86907a47c0f19e271881e5e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "454d84b174ad9e0d25d5464708d54d76f42db5bb7bea78e96243936e304d89fe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "348a4db3012126b56bee9912a79757fbd6204e7a27b31c16098c3a8426df8232"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "83d2962588882f1ff752d8967bb42f02f5f51cf5562e1949cdfed9d3f46d9880"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "a7c78211ae95fec8139a3856935eb9035c7a8a24be2c61e723178ea254806895"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 152896, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "51b815a3431d7647fb8afdf46119dd96c80288963f419c06af707e3e1181d7f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "e52568249ade7ac525a6209aa9e5ce1bba73e27618dde8a8f3a08aff92d51273"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "30c1acfa146666b9a0d9f62a6727acbd8c5fa18058ceb0deaee011da712a513f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "b31dce805cff9f1642194b5f2207a7684c5c3fcd92c3c3d37b732be745531090"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 164512, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "f6c7b2fcc57383d8f9ec6682feede2ae194f208c35d9a7cb9946932bc5e68300"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 152912, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "391fedee4c5db4960bc1acb4b4737b696802d8b7eb693af3ee5f48b836103b17"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 1, 
true, true, false, false, false, true, "aa8049219d7dd6084adae1707bf05380537f23135ab4dd1a7f1c689c0074a172"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "c74376d68ab300e33a31484c38f3f7cd3ffc6437c2dcf4fbf17973b006ee3ad2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "ae7d4ddff44c2989ee778af733ee13f90caf6311d8441ad7a8198832b50b4458"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "395b97fbaa06d20942bc4d399f8137af64551f84933de1aa60b583e894c4e65b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "d60aac3e6cea735629caf1cd3501d7ddf2746dee911552cb66215b5fac40c67a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158192, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "a766485171acef927647c2ea07a8e5261a137274f64671cdb1ab225269db185f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "87363c7e88363b02ec022dde7f964b160e1eee37b3565ed0d784c2df77e42d00"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 167408, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, 
"123cc8ca2975a5017a13987af51fec673751fdbf8c34cc118a8bd2f69ce91f73"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "d51bd08cf69be6d52c98975ac50e39bcef70b90a4f21db3f473d28617700c6e3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 157168, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "ca04cb2168f0dfb84196890b32f7d787c610db0f0b35a0be921f3bd19d8988f0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 152880, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "ec7bc4d292399066c4acd20e2028fe5a6af46c66604a91d14df9e31498ad0c0d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153584, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "b87c54f4546ca44eba778036cb9f5b577ec31737f80836c718d5f7f6474f9218"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "16794146f901a3ee5581f2f1290f6ccd16410990c211a238d65ed114152e2aa7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "fcaa3e7c7441d3224da0083716e2ce89c79bdd6450cba1d8a080fe39a6816f2b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 165376, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, 
false, true, "93f4ee272e1afa24ff772f551429c81fc48d55f925a3c42fa3353b39813cbf96"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "725e20558757326cb74158398e8c4ef58b828438b16a262fee1f585f2444afed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "12477c627a7c2f7575d59bc9292cd5313cb758abc49ca04beaf30d8f8860602e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159312, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "0983f9a05c5362a95f76ac7580ef27b455d741d92b50cdd6947040751c260f5e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "b81ee8b4dc3ec5842622fea1c34c84e275c84d4a3dd1ebdd56c0e48497191ccc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 168784, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "b9badfb570c95f785e37139f7c9d8b682cd9aef90f6cc836d4cb5bfdbb6fb02d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 164512, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "29a0484e563c905c5b839c267f1de174aae33ab57073077bc6c42ece22c9fe4a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 157184, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "afd493c6e83217080d260ae8d74e774f18936e5df8e0482210aaedb70a398500"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 152896, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "94577daf4c3c5f93f6731d3abeabea1683f904df3ac9bb9815c84ae715a01b64"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154576, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "e0f5549be49ec8594c7183a9898fe27998934064c2e1807519e081bb9f83b739"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 
384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "45dbd40433c0cb45f6493f5ec9a97b96e7fbc7c4797a5540bb56d4c51019501f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "7550e035629ea2315b7be68251550e27a2e4fcfd9d8a50e4c8b92e84ba99392d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "5ad9c1d85b11a4bab385e378ed4fc3bc3b494576062154a023854dc1a3be80bf"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "8253779dec2456d398eed24c421d21f9437f7bf18e1c67bedc0321557d87f3f3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "12f806fed33e99d34f66c374111e5243d3980419172cda19c46241ebe1418680"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "003be6ccc42124dd6a5feb17a8cfe6139cd62b93f9aa4edbc4b2c95b4ad7a5e0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "bbad3753a700ec7f4560cb21667a42d60e4c9284b825e9f2560a936018feaeff"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "833a9c71cefa6e3a97b2ee39a16b16309a1db69c8a0fce1b070ee6ad7c5eb6e8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "0bd3fc303865bdc7ae5740e45bde6fff297810d6adb1b95b83d76a0c059c7544"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "431535b401d07d0bd2c1456b8fc379ec1cef0072244a13392f4103c72eb8c0fa"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "7ca3078633694b9c475e97dae40ec0ac6759cf320692a34501b41663fd98f7d3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, 
"15f2e7d8a39465ac6e8a36476da25d453e2bd9fecfe40c4a3e8c28a107bf16bb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "22497d84fc6854982416f7041dd170f24673eceead681460fa0943c3cb9ba1b3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "4ac45d032d589bc38b7668dd4ae7c9c50abe69909efcc6e047e9f246eb739156"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "f0d939e32ab1da299af2dd1b9ebe5f98235f6cef127d455948e05ff77ff5212d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "fda8c38db7eb5d64273b8ac325060351155a0672fe060557fd287803152c06f5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 148592, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "fc56ede2c249c7e10dd79faef0d202a8371591ed240d553daf2346add31149a7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "3f4c34e558e0d103ced41975cee9103fbfb4cd00f9cc6807a7d02ab0b13cc78e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164992, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, 
"d5f7261c25cad9e691728f7ad06b560a6985ca20c42199ce72ffc24c7508ab21"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 180408, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "1351cce061a439e8ab061de60e286d36b804a13e833f1db5a5c5c49216fdbc29"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 193976, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "51616f8509287795dacdbe2b0f45be03a647da0b047a07e7333d73648ff57c93"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 148608, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "1649e066360868ab9daae37cfb2ac2a160566f79fac1344d2e187f9c0b67e456"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 173624, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "9d5e92ceb7532eeaa82377d17dbce06e3554e4f6fd46bfae6ce05cc1e7f4481d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "28ae1fe4de85a7617056f75ad18c55a4f39971c1d69f51ec4e686a1373d82e6c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "24abd13e23a83a11fe59faaafdb5443bea5f72d017f236e5df04bcc8e7dc01dd"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 1, 
true, true, false, false, false, false, "fad1fe826434bc810ce7d550814863633ca587dcf272b6e4f4fde88cdea01c05"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 148576, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "3b7799c25c59f779e826fe5bb4935e10f25c20519d4443d0c3322e4ff017df32"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "ca7f058d80893b7fd7e52a36d3b163f8293234865505eb2e425641e5480e318f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "35e0811080ee41d9ce9b2b043835309143295da79a9db03a5e0c38c3b6020217"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "69aae41076e2dc5d085aed617eae9a3fff84d101396a817b07e1692c52234888"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 160176, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "38967de42573fa9c95f6c84f9aede3d00e3e08ae69573ab2ab21ee2060a6240b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 148592, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "1aac21b9ab07076e83924129ea971803da6eead1550117a1f755b96a72486edc"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, 
false, true, "fe47dac432666d8502db4436e75ae2126f2c0f5500d753181166c32f10aec427"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "772af6d61a0d2b8f98f764b0df087791adc61657ff7ae4f231cc41128cda81e8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197904, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "7331b612ffe9c6de5dc2cbfd1ceaa72e5f964b8ad441077912f7c3af1b683e4c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "c6db782e1107aa89da708dfbfaccfa0388ec43e5ad05e460615b5fd7bc9a5098"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 1, 3, 
0, 0, true, true, false, false, false, false, "7417f809a6d3a45920328eb907ab00e90b382842ae72b60a6104c585906e8a48"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153872, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "8f06e9f96e9b87338985c0f73f8f37650aee0ed02361c727a9154f8343e7cd47"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "d87abc4f66c8d7ef9e1e948ef2a8150b219abe310daf7e359bf40e02f7482ae5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175376, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "f92452c8fc0093bba605a4f54d1d2a7c5fcf512ea5f555c4e278d42deac94a71"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "b6303f5abd4032ed6a918921cdf96095f866572595523d291ac5129fec7fe86e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165136, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "4f14c938c5d259570bf9c31b47f7f8c833ac6cd022eab004e9f012af71a5656b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 148560, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "23006c9f76ebd71a42edd7a92f60ae51ccba0dfb00faa39fb4e90f0208f07442"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143120, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "3c114d5de8834a3e735faa188e02fe53fae0ba09197297469c50d2a0299f86fa"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "a9b858f183bae9ea4147b7a817b2d6073cae1ccdd3247e058154fcc75735ddf3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "2aca2e84d9c5f61a432f3b78b66aa3b31918ce2eac5bf88e06ebf854eab5dc65"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 197920, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "7dc461c0acb8ccb4f5f15f6491c1573be17a9af242cc46e2d9cd3c5702cf68d5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "96afc12a96cccf33e2bb2279ef592f137d8e1fb3706bfa65ac97a66ae2cc7d75"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 
128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "165279e6349e49acf7f6f6063b092cd1ff7830d197a36f4b654335983a6e46ee"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 150880, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "b689339d78da0b77886e3b2cece46d7ed1209703643418b985aab1d4f8f4d932"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "76ac78fd1703b99b5f1dc406fc222fe20ff218f4606c7190e29a0b616624637b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 
168544, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "ec5bf8db7a03431ef4112d5d1fabfff392e013295fc5254614bfd3a77b0ae21a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 160176, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "a0558534bb10f84defb938c5c518fad214ceace187a688bc430d997bca132d88"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 165152, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "90f2613248c24fe8a2ade409a3b978273852b692313e628ab2df25c34c59cba4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 148576, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "af5179c9193497ae34dd34e1a0379d1d5f29ded76190d386eef6fc124034e047"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 142048, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "8f6cbbdf8f36792a012c2e60d7e898337805f3ffbf68d69037420368f3bc60b2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "7b9440e633d3ed2cf3a02e952448e1af5520d5feca4ebe9e0c0ab46df35dc2ca"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "d86ba2c621e92756517f234284119da15c10bb2e092d378518e5e4c9c5d8e675"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164992, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, 
"7d5002e7382eb91318091ad25197a8418b2fd861d9dfe69337969ca1f1dc0dfb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "11c0af43a4d4fdcf300ce753dd88e5a6ff9fff1718198c842f8845b252cfc383"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "d98933b74cb75b94a4e94be74f2972604401b334a0ef77f794eb241e9eb0486c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197904, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "15e536deadc1af3dbe389b21cf6d56201b4ae28de6d30c20463b0dfb3cc50936"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "ebbc0fcb9cae04972c565df7030396b19095eea0a5af6eaa4693ff160629933a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 197920, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "ef8776f898aa4cbe4fcde387cba48c18cf38846c1dbcf120c9a6a58a739a737e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "2ba0fa4e73fc6674b3d604ffe4199889a2d60f52d15270387bee5b25de978f14"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "65a3a08442f101dfa40ecbce2727d7436e77f761663dbf25ad65c6365f2d58a1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "14a2619d389852128009d1caa678f46c5ac7bb85dc01c2b16f35a6e87c2bf61b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "d31fa35f168b4c49bb3d9d98670d21dc4df1bbfc2a321bdc7180194cb433e0e5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "64d2e8bd73ace03f46934fc7762a29abe26475679c0b56ece03e985e9ae410c0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, 
"c37eed3c56862014c1a6d69d04aa1f2f0591385fa6d71490699ceca974b40919"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "a0e881d407119d90dc0cdf7aa0f333d2c802418b019250674d505fc7920a7768"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "9407230b6b2de00790438ca0d409a3a47f2d0df2dbf8a496093f63cb5ddc9158"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 148592, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "8225434d8c9b2ed5994b640086e23a63f11cdf985d931c8d30b6ab8ffc2fcc11"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "e8db90a948cbd4206908467a2f7339522b99fa0cce338a52ffbdcdbb609d4a92"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164992, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "bbc81a0983fff7d1e309f8f36cd16580234de875bec55042950a1ab3b412faed"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 180408, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "8731a20b5a3d79472b97b113d82d92a5a78eea5a095b68c7810dd32fb7566c87"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 193976, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "ae60c316c31118baad8fa1add6b00443e7ee4bd4fddb7577803f01d346ef7f4c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 148608, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "41c107823aea9bc8d6a2b9d40577c4b5b3f84b5f85b8cea8c67ba46eac3959af"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 173624, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "e5cda20968cef930814361c928fb9162e134cf119678c34e9d8be21fd4a1e044"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "640524294b0688bf24c59ae761912ec61452f2ff153dda6b00067c6665826036"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "ffcdf1e95a11ef1d1b46da8385e30a6868fc621f0446da702b4956ceb343a999"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "ad637caf732381b36961e462e3ddc82d6e98d17d289fa114a826561638180f07"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 148576, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, 
"2a5acb8b121eae287af0065419c5d6ce6ca89f55a8632b1b303dabe56738231e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "8228a1e02688caae9c57287b7cd6de97104018057e5372965822f70b9a9ff556"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "22e228fa1217d56b126048c8e4fbe496b483248ded9b40f133380dfe031e5dbb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "7b2f4a8d728ad42ca759970111adb89a7999fe4a3f2cad4df1ae0d6315576ea2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 160176, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "d85d547f7b4ac7a1ad4d8236fcffefbb5b6630046f5d469214465d817ccf70a0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 148592, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "3db7d2b6950f0efa094b82d37ea794beee2580aa70f8142fee3a1f03f02e1842"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "6948ce996de316cbea9fe96e843616674dd1368a7a190c600d7995e7ec6b8919"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "9d74061869cfef74826e44b8a9c14d0a95c0b2653c94c3f1dd60c85c4a80a571"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197904, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "2fccb5da4a91e04fcddc529fddc5dd6a7d0d7e9ddd9ae45f24140afd88e57282"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "2c1b3c4c3915da652bb5173c028e0f3f6e17add43100e21681c307a75547dd52"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, 
"696fa88c950bb4204c43bdb27f9dbb604c7786d69f5cc5cedca40f6a504a5f03"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153872, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "fc03b091c65bac6c7f5338213f24a31e82136cdea80945ce20aaba8fbf28a139"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "990d832f1310f4e749387a61fe1a9e84bfa864f49dac555e4cab200377f27c73"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175376, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "253716234fa0ff1105ad71bf91d60ca753137a6f82b4b6c4b73b6eb170b833eb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "8a234568389fc08740a8c3b864b90e1f33277af407aaf77d4ecc8a941a534faa"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165136, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "45ccd003ba0cf47b9c6cb4c09578a283e499d90e8c06b3b5aede56d3cb2d85e9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 148560, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "69d103c1ae2a92ca5c58c89576ea0f71541f46a275fefc7b17e980bad65c44e1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143120, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, 
"d081bda147ed9a3370e90fb846ca8fd5608acc74493adf19b9fbe8959cdd5c5a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "2cde7f4c4df055e6da96f39a3622c2d36b33ac865eedc006e18d0454081b71d0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "f0a5dc6ce35fa512ef84b33072e74b48ab86981ba975690b887c35a66b001ebf"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 197920, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "ee4fd260769eeb6bf4d634d2d75b795faf713fefb14dcc401c04fba9470e4160"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "366e323487a5ee9374f53cd0724a66af8cdd88b0374f8f4d7e359eb844059017"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "e105ad478e062b6ff45b9ed2002232e684825630dbbcba3d700c3dd38dce6582"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 150880, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "0e24a5c9cfd39fae3297111634871d957192d1cce93e80f0359909f97a1a819c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "69741c52f2abdc6c49b969e0a56d6a2a77f5931cbe6effebb50b770808a6ecdb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 168544, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "4c840fed3c098a11f25fae67d7360a73108cb01b851bc1136e3b40fc345994e8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 160176, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "dfa7ced60207eca241e647e64636d4e87755ed83d55b3bcb66afe7b09bbac48b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 165152, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "bfe75ca803b46bcdf1ce903dc0d336779ac7d336fb10d268be59c8e494c7b16b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 148576, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "25b887206d343ceb7db23433eb5632570c637128e7a3511f11653320a4bbb80d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 142048, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "09078b6e8aa67e7cbb8307398b83f9c5de19a4de9f8eda0924fb157685f1ee5e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, 
"8f541b80079c7b942b881c0762b4f422a5c138883d52cb31da7b466ecde36528"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "78ba06069d30ca1cd3832a8f09378ddd863aa72ee35b343a60b409ff5d1671e5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "2d2efb4a4a542c4bc3cb386b01898610cc80434fecfa5c13206307ed123b7e2b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "e9449ecc213b9ce39dc3ad2154283fe8d2a1e9f7614a3361c7c102d4ae8cb7aa"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 
0, 1, 0, 0, 0, false, false, false, false, false, true, "43df591c06b4e1e7b3f70c54e327d85e4afe11055ec3727456b76ffc6daf3424"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "3dff19cb1875fe96a4f747ed80f795a486af37d95bb5830ac52d68d34af03e29"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "b66e4e23048b13b5dfcf53e974f222c31bf0bb633d741743e3889bdc792a944d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "c69a1a3a6963238d4630485f0fa5d4cb6380417762f196fda96cf2369ac5bd13"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "b8eb64cd0c17b218b0082e58fe3a647bffaaf92d403f976e48b0325f506e3630"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "c4ad0424d7b5bd40664f757ea743f0491794d0ee5d2c707908e241073c9349d2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "e10ccfa7e2570b091a596f33e586b515a18eba79d2c12b3c2e131a9a09829e7f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "1c2365d7b72ea431c40fb060ce389d7763a6310e3ffbccb6b20f3251c08a6f25"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, 
kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "a0c962f14478107f74f1fbcc9c00a450d995950f0b7df15a6aed597366eee673"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 197728, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "dc12c99ca0c5a8698759243171a772572eefa04443a21ee75f8eaad06fce9284"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "c062b9d60bd274dd141b93f4a119691845dc88a70309874cfd80645c9d0fb21a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 216680, 
512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "06580ccb5d61aed87017a50a4c61d687f3535d98c1d86c70b42df9879fcc3ab0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 164960, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "b8aa5ea66dd672a3da794ae6ceb0f9e241bf24a364bec740c4a110a6a2229a73"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "043f70c308b20251c6ec9a81bd13da9c6b59a0d7b020cf9fdb08e9dba3c483a7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 197744, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "dbfda8579139ee12291af5798bce7a0080086e54b1639c4ddd02edd0d02917cc"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 183992, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "c64fd179ac577d17edd72d70653ce65ec7a8ce3939997881b6a22799d0be72d6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 201656, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "a2becc166cdf86822582699e8967767011e862a85066a697e61ed064d41dafaf"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 164976, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "106b087c7e9ca07a194426a5e049e8f29700187015c327449ccc6fea1cba462d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 175160, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "80e358b23ef3ca2cfdc812cd6e0f4188f5e8f19bd76bb69c131dfdde6dfa8766"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 197712, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "ab65cb1b9b75cbdc4f358b26d3f52d7a7053b872e0c4895922a7dc2abc3a381d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "d0e4fb4030d76baa31d9cc4289c269b75c9d92ee97accbdbfdf76ff0a304f48d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 183392, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "d363c051b510eb3bd80fa7c1714e98db234ad571689cb8402dfff83f2a42d67a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 164944, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "34b60f76be16b228426ccc0d43537eab3963d498d5a315ec4d001858312794fa"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "0ce9d92ab0bb2f0929786bb9c4c884a8dca724421ee60d9938fa1ec396824cbd"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 197728, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "434324a0cb3d005a521783aa46d6e841fa6dedc430e59a596726c3e851bd510e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 150704, 
384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "5f7184a0e625145de83f030cf0f55c8b5334811e6b741404ae088998e8e16f44"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 168368, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "ba00183f52a19ffb558e800dc0568cbda38ca253e3bb68302b90b0a39ef31001"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 164960, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "4684b33cecd4cf34fbafcc57a09d621200d794da8eddd8dd1222e90e75d03bb2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 141872, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "d23b3d446377cefb8638fb2292517bb55db28b0921c67834911641d0b4910d42"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "ba29e23eb1d1e7ef2bc448674ec95a2e683811790caa4a0ba33615a3994e51a5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197872, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "faae161a7c058123a8f61d1babfd60478cf00e23db3100cbed4049ca2d0c3088"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "9bcc78d028945c8c38db3874c1c2cc627c7b0bbc79f0fc1f668f060037f92879"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 197696, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "07adcfb6bf16a9b8c09785bdd6420bcf52f08e86c96134c27b9d11f585c46af7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 
256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "6ca183c64962a7d0a71c83b30d3b2e94312da1ba61f68105a7aeb558a64862d8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "ca26e158b53d407cafb519e777c837a4aef5fdbca2af9c13041ce2064d3f60cd"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 191760, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "0b9dd24e353d064d442aa54b3d1013e444004492691f90b81f5ddbd081d8cc05"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 183392, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "611cde0c2755e7190d0fe7eed9bd15d1d6625af9dc378f7854d68eb2e9fe0df3"}, +{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165104, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "919b5415d0c81c7a7348dba827737f31e7e7b943335d423d474ee052110a14a6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 164928, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "d5731e4b958ff264ce7d71867b223cef3c9059d526837ee481e6a34b65e27488"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147216, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "2ee9a52658bd87a1a117bc48c69a288e2e6467da300764fb58f5620c0fe9f5c5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, 
"12c6ba9e5268f6c60625249b381e59027e8264005f69b0966f891f83ae276c5d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "f12dcc1e394c0ef62d9f2e50107bc959cd2d29789fee9a39ac91c9c2d37d1d2f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 197888, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "9523dc7cce827a79b5a3bf8da06f23ccdf7ae3e4ce69323c3ce93e7d24a2e25d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "2ec0a79341942155a90d35a98ca8baba4edd220862bad40225044ca14f350f55"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 197712, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "525e62944e760b3779610be852429ac8293433a4745ffea9136eaf00fa389964"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 154976, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "19325ef5027ed09f18a0e763f43de00103abe2fa45797ed2ccc553f91b87a5f4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 150704, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "0578cbebb0b3c50a2b1877574e9902e16304d0e019da8d3f2ce8cde330584588"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 176736, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "8462cd090b2a2685ad37fd40e6bd321e48f51c396ebdf88623e061dd00a3469b"}, +{ 
DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 168368, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "7100d2448f78dbd3079a721ec80b7d95229711012ccb9182fddc8be727d32cdd"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 165120, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "71def8a414628d9cead24930d8c5d488431f87aade767abb6e30b719bd5ab7e3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 164944, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "b7b6bb02055dfaba94adc86fb706449112d2173987c91d51ee6c96195205d547"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 144096, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "59b1da8d0865b4b231139f0e401a8df55d86f9f2759c5cf7b2c098a49f90540d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 141872, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "8f12664b023777879c7568b8574879b649f981e89833751beb19967f3932c2e9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "edbf53b90bbf2b487abcdd4c209588da1797c0c4bc64b43f6fa3b28cc3cec16a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "565a354d88a8ffd7b47371326c0de52d6574bc99de4488703e41948601b10be3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "aabd91432319c6aca487744c477881252728004804fb64a73c445ea52e76b119"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "c90da7cf51f71d91b436750018ad4cde6a901963239754482e11443145bda79d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 197728, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "070455e19bff030cddf73002bc1370562be5c16fb29f0dead0325d8bc73ad1fc"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, 
"f4a3cff8c256d4e448d5034fe14fa8d20ff214a2f5b3abe4afcfc1a32b558733"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 216680, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "a9a01010ee673f66086509edfda5eba556ef95eebb062e251d05479a86d909e2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 164960, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "4cb41ca427a35ae955cd4bf28f76ccdb8b8e4c2f1811ff52582d8a0fed55e397"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "a7d45c73ea786d91444042d7cd47fc2e11354f0e3b38e5a530f2349094a8bbf4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 197744, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "dc6cef0f882b54e13d58095cdab74bd2b98f11502006e416d158fcb8c0ab856e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 183992, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "47042b9be6488754b55d15303628fd1a30847cc5c738c306d9a0879f94f2f0ba"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 201656, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "e926bc00ef86c8f4d5e82637290c2bd146b3ed7277757ae374f3240904a2a7fc"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 164976, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "a55f67e5a6e7670a95e178c1d44e95b415ea1c6b521d3e9048aa268346e1934d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 175160, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "dd6544c0be0993ad1d9dbdc9abf7ac285f6d48220e0cc4fef490f9ae8c2ad549"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 197712, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "d9c1a179c94abe02ab42189b0f764c674ddf55f8b8508d2855675ae1093c658a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "270c1166ba90f01430494affc741c557f599b5d3b9a756b564f869e9c7e3b8dc"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 183392, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "10ac544956f6d1fbf5b6127d1eee2e36e08b6820711fe60bd022b803e64794be"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 164944, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "df59fe49b7534d40281bf6a67149721f7a8c91f103407692287c919e08784593"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 
2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "e0ea0cd683737ce00a71c0d433d6f60877363d871360cdd484007976318c4313"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 197728, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "5ff6f4158a7f309bf3a2c4ca0cfdc42b7d774f6a98f8453d86e85a8fbbd1cab3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 150704, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "2a726105669121f5291b3ed5ad60746c5c711ac0fa307904bad84570f53dcde8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 168368, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "117cda7dbda9245e415fcfb2228fba975808ed61737e96f063a1b9aef14b4b6a"}, +{ 
DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 164960, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "161b05e77ae7a869ab46c2e2256111e6dd478c5de09e1d19e9a87dffe96c7214"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 141872, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "36e1e434e381b90e23d446b2168a4be383ff3c4acc293e66378a7785c0b0084b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "316239d1d8c5d165306aa1cd230be0d6e24c816094e1d529fb6856dcb02f5a09"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197872, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "cce81044b41aa944b29d9705f1ef97b750c76743c3a189fe7cf481da180dbd90"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "65399c9c6f4f0adef40e571621599e1ce1fdd202340b3b9fe4c5dd2f9dced7f0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 197696, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "4414a084893097020a1718a7ffacc6288dcad8a594ede337ee8586b755f8285e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, 
"3bfe30831d9358733a3ece0bfc5a06ae22d245a206ddae357d1947184fff6f08"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "474bf3839abb8f9ce6bea735631e27fe6370b6be3c8e2a90495850a3501443c0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 191760, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "5b0fe0eb0a364a26b4fa771f16fe81ae99daa3f8fc9b0ab9ca38b2b20a0287b2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 183392, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "ec9f8cd437ecd9733781ef894e92161c4ba320a0d9af1327e7cfa6d57247cec2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165104, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "b9472091170931b52fe45430c8fb88f5c6db297e8eaf77f2b6a3ecb860a74fdc"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 164928, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "fcf6f28a752b4611ff9ac3a4389e958bde9efe66dc75ced55aa7f2a8a02b5de5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147216, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "095459aaa834ce6a00b44e70a55815662c6c4780b96b10bddb786ea385be8b4d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, 
"84726b5f23792fc83d6c76372228c767bde7951d3873b3239f7d005cda6158dc"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "1db26ccc20e841292fa290c7675e5a39550664fe4b6f4fc5c86f0f0971fa91ce"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 197888, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "a726bc4081ed462cd097cef55836c2f2e0031242c500cedc9e65a663f83496ea"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "4ed46d5beaf3d3ca0db75fe70a349ae6338c63834cf7e309b2c6739e35011c65"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 197712, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "0d0ad893dc8b910a98ac3837fd6acabd51ea41aaae0b69a017ae82ba372b95d0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 154976, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "dcb0b616dfa5e02d0920c4668f4dc8e38d886c87e32048068dbba87adbc132c1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 150704, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "031eabc5a6e4c569e3b9271d0f8bf61ebed341eb15239ea4c47a12542047b427"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 176736, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "644b7cfdf491370d08c8260006762775c1f4fdea4b6240144b1a7192a4b224f5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 168368, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "b2fc5ab5dbfd10300dab42754059b3fe84888a4cb3b1bdaba004526326caecff"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 165120, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "d47a9aa1dfa62d118309d3ce255eb88967c911741f3eaca02eb6b486e5a9dd11"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 164944, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "157b5ca6a1a04798471102ace2657346e6a7ba5bf3507e6f0d23f77eaa4afaa2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 144096, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "716430deb5cd754519df4e1844e5fc9cbb634cf727ad12bfd643cfd789dd1873"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 141872, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "817a9359521a4a74dfa25e7d703f6abf5b1623163889c635b8c3dfd41a187925"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "2e67493b4f6eb64ec6871ef1481dba580da7228667d4ac6d8ca9c5f03e0b8d1f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 
64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "2783909514a3b1b6e6cbdddc06cf4ccba323d0d96d525a6227b388165d90d907"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "4464508d24e7ad5011113bb8b57f4f3213ac034afc4694098fc0b7d4a78a9c20"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "cd6fc6683191d275a7c9983116c135c553b6af8ad7c0958ac2737d5194fa6a3b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "6152db2aa3e1f7ee3aeebef28427b16b2ef239103a09af9758646c8410d1ab9f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 
DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "35fa6056e17a26faf31f613c624b4f84fe7a1d9e96afb6cf1d9c11c05fee2a6a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "f665c8ffeb4c2769c859499fb9eb1fb0afca21418504dde0157ff9fddf3524b5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "edeb32c57dce3cd84396e5314c0191201c108520a824ef67299d84337d894432"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, 
"7bd94c4250572e5f430c627d8ef96417d096d006d193b47330b7e031be4db814"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "02e6c96a7764714c77bb721d987e8e7387035f0d128b43b5204bfc51d9605fd9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "4696c264e5285e3b13e0a9ceca9142e27b5c5ca5fb0963dcc97af8bc6c191521"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "957c977e1930bf64b5932734211a51584dc7c893836682c0d9e9a064719226c1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "2fa8b350a669540c7d37e616c62b5ad9a30c9a397cbfdcb4b377610c5f12f46e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "8d7a24e872cf0abb16273441319aa8157897172b6562395347d865e6024510c5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 210104, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "394b617295dbc1b8a4e587743d5646b8243b2c21f389cb3c6ec1bbccc216fe05"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "3fe5997220f6cf6d0a1ca30c2482c8180d54824edcc4213a1400f50ac84e8182"}, +{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "b24f8d8304f9e29436ed5359759da8960ed5ff37011073b8ff8c66b5292ea5b4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165072, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "6b47cd6660ee6712591d13aff30f79b89dad6976f2c53ad34e502739131cbfb6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 195848, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "910373b01ae613c1b8b22af7c84a09ffe1ab194d98fc357be5a04525e79b17d7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 207368, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "591a93ab20844137711b3d63bffacd382fd8c4ecb84e487c8cd56eba30e9bee7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156880, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, true, "cfc76f9a5b371f2a6ca345cca374d33266eb42fa0715939741f6e7a2b05d42d4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 190088, 384, 2, 32, 1, 2, 0, 3, true, true, false, false, false, true, "03dbb2a1a2d43b9e78db5d062d77de15edc158e6885e9c17d2d9203732fd9bd3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "66a576f49aa2161ffbc61660df783fdcaeb9b1e88b169b2e71495c65542da5f1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 
256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "43aa7420e8930ded09e2a8a1dbb884bfab913a643892d0ed897c8d1872ae12b9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "376de2da5ac3ab24dbe8c73b11a5db4bd3dad23a236f8ad39735ec6869555b0a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "c8f0f514923c2ab20a728055bad7326f20f1ba23d2effefa2fb47060f524ad89"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, 
"912317281a01f6588985d918da9ebc221683a63704355634b3ecf267b39833a1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "669155061262f9fa64d1e22e2680a6b957c0305c73998c37eed570dfd1bda016"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "6571bb0a7ef993ec685189fb609110d230e37b66ea936e2b4fa76862ea24abbd"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 172544, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "4a166e5e4b83f3eedd0d8a993fb88452937c1b1169e1c16a5070cd6ffb3c3493"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, true, "5b9eca22b8fccb1c0ce0ca4d20f96dbf97fe0b2a9ae5089145ab53c4c1606120"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 1, 2, 0, 1, true, true, false, false, false, true, "4783e0857e779d9e9b2006899388b445537d393b8a9cc1b3430d514aa7f89643"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "196f5fb43206e8852da1e078be9b1ab039663cea7db44c63775275f16c572163"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "e7785e06047bd803dd80ac4f7b4731152ac7e6649f33235619463553fc129992"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 
128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "baa494f1cca38ecb2498965d4d2059efe5f5f5cf66ae4f05fdc8271389ff033f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "1800bcc6838aef5ab9639b19eee70f02ea5de00fd90fcec84b81dc8661bf7988"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164192, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "e286e9e377c950a2501e29c3956f38d6904ab4e89afb2474e223cce0285b26cd"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "9a0cd1eff4026dbb05a126f2912d72fd2b461ec47ad8c36b06c2cf0299df0c5d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 
128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 179552, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "fa2401e76f676d92175c7151692d4c0565789931793cdd2df419f6ddabf0c198"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "57528b7d60054f41f6df627a3f90bb3a7c3013f3ea136c3bec2de17888b6ee7a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "39b7de2479bc56080b010a7f7240040a56e1d5400656586490e8664d8c4cc00a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "86cb9e1fa5d67ed7c0f0b45772715096d2b5cb7dd8a552ae4c333d5a70792aaf"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 
DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156512, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "54f9299fcf21240a120ea815b7acafc4f5a3a13226ab486385ce0cd7cc9d73e9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "20d136301299fdab4872e6b8402c64786a41cd758093b35d907dd9bb725d75c7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "5fbbb42970b62395ea48b64c7eae8568035992e7c9df1d4c81a7f75fa4faaabc"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181616, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, 
"0d06e5d42b1a78a2b307a536c9fcb64de81258a85cb6ebbcdabb069dbfe94884"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "5d23387e53564af7b06bb42e3aa732f2d4eed37de3548b812b2c77e7ebc85fb7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "aacbbac2496ff6111324cb3e5752d370bd4cb8bb4519eaf7d7ae916353af2c31"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163248, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "e16096e9f7ebc892cb4364aa6aeb324cb95cf690e8825865a3dcd2d87a2d5e7e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "af9a9884968310acbec405518316134d2978f58d1ac2c966415ef257d2c8356b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 176816, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "9da81d93e3832350b9a835ff6008b3725044742691585feb4af588ddf93bdfa3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 172544, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "3e38ab4dd7b032163edc54ae1a24e4ff6ea494c26ddc5e9631230adfb5555c4b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 165232, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, true, "12e582da4186516621547d90186047f5308f261570718859d85c3fc4a29044b4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, true, "567fc73caa7452bb3fc08d02672430e11ef6ec03952a7f0741dbb8ceeabe8fc5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156464, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, true, "bf536f01bfd404902b3871cb20890132ee13513054102fcbd87e836e36f4aac0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 1, 2, 0, 0, true, true, false, false, false, true, "c9104692dd0c30be24a35a82c6ae9b8a135f203331a4f20157905b44ddb30f29"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, 
"0707e2680b1edfbcdbc772ce7a1b7281d85ca97690e548a2e039fec7150e8189"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165072, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "f54f999ddb9464ec54092aeb69b0d98d9d6ae454e58faf1015ea0f2898a6cf12"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "39f99fa935dc62d735f8880d1ac7a568cc581a539972ba99373f6e0182c5e88d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "dad1c052fbbd4ddf7104946ec8a45b288b1e9989388c3c7dd9cbeeaf181d1710"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "d14a5dc72c46fd51e129d8dde4955f035d99cf7bc4cc4893dc2cdf85bf2f14f1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "22f9048ef65f9ccb34c6f6a9971684200cec40af4bd91deaf89302795ab4e5a5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181616, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "c50b69ae59053a3017cb57d033b7a0402941c238763603cbe0510d362fae3c68"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "264066468598ba6d7cf85923fc67c1d29138563989311aa5712c342a6e395474"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 
64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "8bf0ee4efa3d4530c75d71ae6ef23c05b23e9979e54d75f65709d078bdc05f60"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "e642a67fde1d4bb539704f401be2bfa15f8e1b39fbbfb73d7cee0d19c8685125"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "9d5c481457e3e914d8222eda6916e04df00c387cc9453db8dc261aabe6a374b6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "64d1a9db9a83fb6aefdf4f6ac0788306cf209641a42679853e85c6c04a9bc1c3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 
DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "fc25c3bea9b3d544faeb9263344405538fdd85d2584630f481c1ddee0143a692"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "76d38fbf632775b97e9fe9b2c56f0d5e3acab54a2e05021d2d883779ee904623"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 210104, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "efc22eede54ec10b7f5a762d37c253103bc59b9c9d81b7213f355b7829c6b7b0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "f48db719076069971132fb9402ad36a3c06461e391997f656ed57224d2bf2f8b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "399ad0f92005522ef7caa3045d172192a562a9d17fe9dcfce5339dae0cca9270"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165072, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "68ed2d6b8ce1d7e39d102968e4beec10db69166996afcd7d032781d648c494e1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 195848, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "5c3725830145966f645e010f9ff5ece5904ddf6ab23fb149be68a88ed9bb6507"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 207368, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "c978bfe882892c1e473f2455c792e4d347f62fbc6de4f7894bac5445b57c3bc6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156880, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, true, "9ae4a9644e6f6a36086885ff001e7a9647c13a6967298893aace1bf9ac1a2074"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 
190088, 384, 2, 32, 2, 2, 0, 3, true, true, false, false, false, true, "9123ee4c6ecdd70600a96c39f30e2ce5ab6a6ee3d925cf5b6d4c92bb54717b89"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "98040d773fe1f39cc1b96eabdfd9637af9cf48ce08175fef114d8073be330204"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "d23a96a41d087c319faa4c098c6eed840015c296a7cc1ae0bc3882ecff46f9e5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "920085bf3f6e4f25e55fcc9ee4a828d001335ebd37b24caef173b07595b9adf3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "c5cefaf021024da91f8a4d075444132f63fb154dfb97e98ced17a59edd953b86"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "7ce178d5f56a468683976938308bba2ecb88a35329cb4b43d7cc93da09085816"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "106fe87e316ff41fddeed3525af8ade0cfb5ee359040fe032b7483d51a041ee3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "a4dee3fe7172fe28ad63dad71bdcaca5d7322846a4ad75e32f5dacae9f679f11"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 172544, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "b55fe40eb3b83dfc59847c6d3e7481be5fd8dc1476e453e4342539b7c6406bfd"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, true, "ff22c1529c45e80c1440657360857cb5300460facdf230482a24e5acae4d24c6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 2, 2, 0, 1, true, true, false, false, false, true, "53e64deeee292439b9e9304d506706ba485b42b722fa843a0c7fb1c6e6f7998b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "bc38bfac5b1bf10c5d0dcb3c079a73246424be98fb57b229fad09355f9bcca04"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "8675253da5874f8607320176e31e5b7e874a7ab506b7596fad41bab5ffd3cc00"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "a44c0d0bd92d3c1a0d4f80c597c2e01237f65111c5f0e62225d9a79e075b8b68"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, 
kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "4b362e5a4dbb1d8c1ea49fce0dd030095247d3cee6ecb96174d1599db37efc58"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164192, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "ecc9430d1f026a59406c8dd3362fd8c63f68c2b209d59bf3044b0e39cc876164"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "cfb7068570fdce73f96bcd80b4df8c274becc92978a0c251f92fd2fd9b3d740d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 
179552, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "ff611ed4df53e90be7a17ef35a8e41b34d6542e15a2208774be241339aac45a0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "0791a5c4f8c5758270c502e72a4ed1140075592f3d2356eb5a720f8bed3bb33f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "3cc3ec3979197d17f818b90ed013a504371eb27aa7e538af086773ab590ceb46"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "a5cb95843349c16ed31bb2d749b520b625ce5063c04766b04c4f9ced804d5c95"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156512, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "2036250e6e536e6c8147ffe917573491ca5f8d689be1e5ac9e0c6c50ec8c9ec5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "01aefb9c42cc76d3fc56482fc1539bc5a2a3f78bcd829bb76083114fc8030485"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "0f7881e585d79791b84c7812f3c05456c514e9a34f491916c5d3b532deaf8fba"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181616, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, 
false, true, "505b6bedd88bebaf26c3e2e82af722c55484230063fef56aa68be777b6a26d29"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "b492e939aa95dbd7f01c53e1b3e17a400e49b3feba891f03bdbb8032c9519cb1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "8f6f13dc14b356018af244e399add80b29ac6268b12556a0815729c222242c27"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163248, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "c2462eb4b30ac81e2b96b903552e7500b053668b8e5c4abcbedff69385e255bf"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "35b7ab917a42dbadb9940dc7a9da7a1fdde9a28c9f4c0422c82794b546f76c0f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128PersistentSwapsAbForGen", 176816, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "ee0a72bdedb0d6e904498547c4f44082fc8ed125e0ad1cf3b8dd21718c1ec281"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ32Kv128StaticSwapsAbForGen", 172544, 384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "e2bb9d1531036f23cc93a3cf1fdd1ba65e0f3e3029658e7a70f8f78fcb2a1d4b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 165232, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, true, "71aec10ae5c01904be83d365a0a62e52a1164307c0929bf62643ee9100adcb22"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, true, "2a13d239e41a643aea7ee58e1c65af6f14bc05d9cf9e138bd6564a3db413bfb6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156464, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, true, "b9754a82003e4dbf1268070b0292e880e52bb9021893e6fe192202119fc2b01a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 
384, 2, 32, 2, 2, 0, 0, true, true, false, false, false, true, "19989a904aab34a03ad0ffa9aba6d0ccfaaeb018f0f0974720274a3f89067be7"}, +#endif // EXCLUDE_SM_100F }; // clang-format on } // namespace kernels diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h index 681e9e0685..b858fa047f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h @@ -952,12 +952,12 @@ inline TllmGenFmhaKernel const* getTllmFmhaKernels( Data_type dtypeQ, Data_type dtypeKv, Data_type dtypeOut, unsigned int sm) { -#if !defined(EXCLUDE_SM_100) || !defined(EXCLUDE_SM_103) +#ifndef EXCLUDE_SM_100F return TllmFmhaKernelFactory::Get().getKernels(sTllmGenFmhaKernelMetaInfos, sizeof(sTllmGenFmhaKernelMetaInfos) / sizeof(sTllmGenFmhaKernelMetaInfos[0]), dtypeQ, dtypeKv, dtypeOut, sm); #else return nullptr; -#endif // EXCLUDE_SM_100 +#endif // EXCLUDE_SM_100F } } // namespace kernels diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/CMakeLists.txt b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/CMakeLists.txt index 671dab4d12..c5643ca536 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/CMakeLists.txt +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/CMakeLists.txt @@ -17,9 +17,14 @@ file(GLOB_RECURSE SRC_CPP *.cpp) file(GLOB_RECURSE SRC_CU *.cu) -filter_cuda_archs("100" SRC_CPP) -add_library(trtllm_gen_gemm OBJECT ${SRC_CPP} ${SRC_CU}) +add_library(trtllm_gen_gemm OBJECT) +filter_source_cuda_architectures( + SOURCE_LIST SRC_CPP + ARCHS 100 103 100f + TARGET trtllm_gen_gemm) + +target_sources(trtllm_gen_gemm PRIVATE ${SRC_CPP} ${SRC_CU}) target_compile_definitions(trtllm_gen_gemm PUBLIC TLLM_GEN_EXPORT_INTERFACE TLLM_ENABLE_CUDA) diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmInterface.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmInterface.h 
index 9824c9df37..50fe5f48f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmInterface.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmInterface.h @@ -317,7 +317,7 @@ GemmConfig const* GemmInterface::getGemmConfigs() const size_t GemmInterface::getNumGemmConfigs() const { #ifdef TLLM_GEN_EXPORT_INTERFACE - return tensorrt_llm::kernels::tllmGenGemmListLen; + return sizeof(tensorrt_llm::kernels::tllmGenGemmList) / sizeof(tensorrt_llm::kernels::tllmGenGemmList[0]); #else return 0; #endif diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelMetaInfo.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelMetaInfo.h index 28b2631635..90df35a927 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelMetaInfo.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelMetaInfo.h @@ -31,31 +31,7 @@ namespace kernels #define TLLM_GEN_COMMIT "cb901a73-dirty" #define TLLM_GEN_EXPORT_VERSION "7.0" -static constexpr size_t tllmGenGemmListLen = 109; - -#ifndef EXCLUDE_SM_100 -extern unsigned char Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a_cubin[]; -extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char 
Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char 
Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; +#ifndef EXCLUDE_SM_100F extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100f_cubin[]; extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100f_cubin[]; extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100f_cubin[]; @@ -70,26 +46,6 @@ extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32 extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100f_cubin[]; extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100f_cubin[]; extern unsigned char Gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100f_cubin[]; -extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char 
Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char 
Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100f_cubin[]; extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100f_cubin[]; extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x128x256_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100f_cubin[]; @@ -103,28 +59,6 @@ extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100f_cubin[]; extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100f_cubin[]; extern unsigned char 
Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100f_cubin[]; -extern unsigned char Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a_cubin[]; -extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char 
Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; -extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100f_cubin[]; extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100f_cubin[]; extern unsigned char 
Gemm_Fp16_E4m3E4m3_Fp32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100f_cubin[]; @@ -139,35 +73,83 @@ extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100f_cubin[]; extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100f_cubin[]; extern unsigned char Gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100f_cubin[]; -extern unsigned char Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a_cubin[]; extern unsigned char Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100f_cubin[]; extern unsigned char Gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100f_cubin[]; -#endif // EXCLUDE_SM_100 +#endif // EXCLUDE_SM_100F #ifndef EXCLUDE_SM_100 -extern unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a_cubin_len; -extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int 
Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int 
Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned char Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char 
Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char 
Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char 
Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin[]; +extern unsigned char Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin[]; +#endif // EXCLUDE_SM_100 + +#ifndef EXCLUDE_SM_103 +extern unsigned char Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char 
Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a_cubin[]; +extern unsigned char 
Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin[]; +extern unsigned char Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a_cubin[]; +#endif // EXCLUDE_SM_103 + +#ifndef EXCLUDE_SM_100F extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100f_cubin_len; extern unsigned int 
Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100f_cubin_len; extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100f_cubin_len; @@ -182,26 +164,6 @@ extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_ extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100f_cubin_len; extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100f_cubin_len; extern unsigned int Gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100f_cubin_len; -extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int 
Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int 
Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100f_cubin_len; extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100f_cubin_len; extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x128x256_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100f_cubin_len; @@ -215,28 +177,6 @@ extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1 extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100f_cubin_len; extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100f_cubin_len; extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100f_cubin_len; -extern unsigned int Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a_cubin_len; -extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int 
Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int 
Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; -extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100f_cubin_len; extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100f_cubin_len; extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100f_cubin_len; @@ -251,1555 +191,85 @@ extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1 extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100f_cubin_len; extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100f_cubin_len; extern unsigned int Gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100f_cubin_len; -extern unsigned int Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a_cubin_len; extern unsigned int Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100f_cubin_len; extern unsigned int 
Gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100f_cubin_len; +#endif // EXCLUDE_SM_100F + +#ifndef EXCLUDE_SM_100 +extern unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int 
Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int 
Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len; +extern unsigned int Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len; #endif // EXCLUDE_SM_100 +#ifndef EXCLUDE_SM_103 +extern unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int 
Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int 
Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int 
Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len; +extern unsigned int Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a_cubin_len; +#endif // EXCLUDE_SM_103 + static const gemm::GemmConfig tllmGenGemmList[] = { -#ifndef EXCLUDE_SM_100 -{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len, 150528, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a", 320, "ef90d8459c870b5eaed737090a0839e43ab9b1979b8c55b650bb7f1fb2ef51a8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) 
-, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 128 -, /* mGridTriggerSecondaryA */ 1 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 0 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 128 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ std::nullopt -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a_cubin, 
Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a_cubin_len, 150528, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a", 320, "43d51e76383ac46485f92f7fecb2e8caddd1d2c9e8f10e0bafb916268118ab71", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 128 -, /* mGridTriggerSecondaryA */ 1 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 0 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 128 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ std::nullopt -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ 
gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len, 141312, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a", 416, "37a5775337ff0a8b330b1ccd09eb178bd454e714487f9abc2c6b932c3bf953fb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 128 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* 
mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 128 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin_len, 141312, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a", 416, "b4abc63c952b8cc92ffc0c7b11be86902687f745a840d42d54ad2be5ff44c784", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ 
trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 128 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 128 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len, 141312, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a", 416, "0aedd71335551f1a9c341366d31aa8ffeab1ad556ebc5dfbb80dad0e95c0b33e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 128 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 128 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ 
trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin_len, 141312, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a", 416, "d6f7922ac084df6e8e1bd372bbd7c25787bcce8903da3cccc92f237fff7d3b4d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 128 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 
1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 128 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 109568, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "b37d24294830c8f6855304535349059ebf843c500ea2ab9c59ea36f8f052e8ac", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* 
mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* 
mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 109568, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "c3251a57acab6b10ea7bae87da99a1f5607f8f55decd5821b1131988095bb651", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* 
mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 109568, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "42027582fe2df190b57b14576111bf776d477585c30dd4c8ed9ae261d02420b3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 
0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 109568, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, 
"fe6415d35a3ac5797a0ee26a4d460e3c8cff7b4e254ca72e05bed5e449edf0fe", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule 
*/ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 142336, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "c3b7eb7037b6b1a3da7f86b381f4952b46ecb8fee0afd6c5121a2fd10e0b3b12", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* 
mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 142336, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "5b806c399c45dfa1859416c11712e2f546c016d6f5c31b46ec6d9a3333ed48db", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, 
/* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 142336, 
"gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "f80bc152541ec413005bac03dcccc3687eb4d4da24f0f6a647b8888665e584e2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ 
gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 142336, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "da5a3e58961302e97283e26a3dfd50e3177109e556d1e8c14822cbe709f63f4f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* 
mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 207872, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "5a674a32baeffc1093f8a76334458bbe482d6e701b6333388c2411f42856b462", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* 
mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, 
Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 207872, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "c73e49bdf2aa8f04758f9492dd26abf1a7acea812c1f128c58b903d8e4878ad0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 
1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 207872, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "09406be5b805f2a09f947796794cc6114c077e5d8b78cf9b88d15cb3d936ceb4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* 
mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 207872, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "b58d149aa8c6bea95da05afc0b0f60b5a9dc7b222871abb0989ab8f32f1a3f55", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, 
/* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 93184, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "c102250f3b05e29eb10a84a236e5fb3e40ed798d442f840396bbd0de11514d0e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ 
trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 93184, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "4ace2312bc8ed97281bb47fb094a4872e220d8fd5f10571b26e61e396f7ba4a4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, 
/* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 93184, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "34f0007c4c702045f3af86d4e35969eecfd8ddd393818243e7cd4b360896d8bb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX 
*/ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* 
mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 93184, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "07e9b5fef206d41ad09a69e670699bcd8b3c601673a57e28bce3cc07e5d77de3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* 
mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, +#ifndef EXCLUDE_SM_100F {Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100f_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100f_cubin_len, 175104, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100f", 416, "dea55bdf170fc6467274bebcf4307d20c5bc496238e7a8503a90d4553548cd43", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 @@ -2780,1406 +1250,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mWorldSize */ 1 }, gemm::SmVersion::Sm100f}, -{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len, 124928, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a", 416, 
"3bf456c392699f5ea11c4dbb5c4b68f6c2c284d3ef2379972a53721a15b0b654", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 128 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 128 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule 
*/ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin_len, 124928, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a", 416, "42312d539cc920bef178822c0352df9ad2d8bd3211440565061e762eaadf6cee", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 128 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 128 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages 
*/ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len, 124928, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a", 416, "da2c25dbdbe1cad8ad0da45995aea8e1319f834c7148c026eef3cf2cb7d0059a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, 
/* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 128 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 128 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin_len, 124928, 
"gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a", 416, "30c714e48bb7b16b59f0f9d35ac3295bba47cacfd9dd69ad58489dac3d0af54c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 128 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 128 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ 
gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 109568, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "fb64a22c73ddb482832b7f6835561ff64580babc506359be77c4fed648f36745", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ 
trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 109568, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "72c2574a1a7dbf04739adc56f9e4dc88cf2b411c082b438c62b692877fb88427", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ 
trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, 
Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 109568, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "2fd2af44f3d0a8c9717a473ca531ceea5c2eed6febc920e2896d3e93e4c3f326", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, 
/* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 109568, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "3f903ab26b76e3b40812a5cf5f6c2399d8633f828f6fa15eb483c2841e5b73d8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* 
mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 142336, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "b63dee9a4f3f1fd87dd214fdf34453af0de0839585ac5f4f53c4785c02c272a1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) 
-, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, 
-{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 142336, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "ea3cb0e433e9adc7ca5ef2d0d2fd85e0f0a79e7af6a93d320aa632ef96264a64", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ 
trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 142336, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "6b0fbc9d1ae53866911a0d2d6fcb7f8f141d234d9156b4aee0e85436789df5e0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 
1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 142336, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "4621c71647fea809f2d6154bab8d465d6a106b843db7b903916cd63bb17d792a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* 
mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* 
mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 207872, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "20dac3cbfa097ff056db9b0a27a3678cf82dabc9342f84cf810882a4c15d2ed2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 
0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 207872, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "2713f3b3e8d3caf85b7764d1d8ae6ffca410584f46752c37d8b31985c048c939", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* 
mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 207872, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "03d9c2cfb6fb42e1f02e44dbcbc7bda901681dcc4c0609fb3989250082e4061e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* 
mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* 
mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 207872, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "296994cb70cc9542dee7c38e7a7da3a7740a5a20e6f34d45e0e1adba64b96d25", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* 
mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 93184, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "7fc53732979eff7cddb888495edc7245d839281c4caabc994c462fc78a2c2b8c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN 
*/ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 93184, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, 
"81a81d92172fa7cf1147cc7c1211863aa848cf720897a877506671b446679982", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 
-, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 93184, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "8374111249d6249c4ebfaa9ecb2e0fe0ba3b0a276b05997421f29ba33ea3bbe4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* 
mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 93184, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "811e4f06c0680a4bc68433e142a36725232ec926cbe8e6dc244b77f7d3376b72", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* 
mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, {Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100f_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100f_cubin_len, 158720, 
"gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100f", 416, "049686d936f30c2878246522a039cd9b8a2e31c07e95957f3ca3c4785b171e6c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 @@ -5090,1546 +2160,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mWorldSize */ 1 }, gemm::SmVersion::Sm100f}, -{Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin, Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len, 150528, "gemm_Fp16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a", 320, "c4911ee078a171f0d18a33ed002b9a160d2ad094065a35db07f532dd0a1804bc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 128 -, /* mGridTriggerSecondaryA */ 1 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 0 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 128 -, /* 
mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ std::nullopt -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a_cubin, Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a_cubin_len, 150528, "gemm_Fp16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a", 320, "408958f4aaed2d547a5f64dd65242db94cc0dbaa984786f75d9dd8d25c5e1c7b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* 
mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 128 -, /* mGridTriggerSecondaryA */ 1 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 0 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 128 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ std::nullopt -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len, 141312, 
"gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a", 416, "3c27f416cc42d36dc2f709079d063929c20e145750205723cab3c3959945f1d9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 128 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 128 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(1) 
-, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin_len, 141312, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a", 416, "405da92fd8006ab1611b733e5644296a37df850dba715fa2ca9c88698a253d42", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 128 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* 
mMmaN */ 128 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len, 141312, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a", 416, "5a56cb78ebad1aa89e7cf5ce6571411904b2ed4ff5c5a6d63ad95e2f83a8d745", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ 
trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 128 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 128 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin, 
Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin_len, 141312, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a", 416, "0732c66f96b87a168702fedb2b5a2c810e803b834af7016f5bdc83e41b0bb59b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 128 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 128 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* 
mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 109568, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "227ee209841a0259a34adaae7777af1f7b5b54a6a9b4c21c5a5d753162a2a2ed", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* 
mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 109568, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "0ca9acbdae5defe2213c77124bbe6bd46ff1b340aaa84809d7c36017d6a81c9f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) 
-, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, 
-{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 109568, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "71d6913e251937261256b569f3e7b04f5bd12b8ef10c0be7631100520a7e0d0c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ 
trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 109568, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "3ad16108e3190414a1ae268e035f4ea4c251b6c598cf1e2c6637a91bd145b66d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 
1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 142336, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "e62b2f9afdc741860cdb0b201c98881ba2eaff208fc4ecafa573c5286e03f7ef", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX 
*/ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* 
mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 142336, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "1b9fe74a9f80a5eadfb6cda1bb0cfdd719969280a98bc5efd76afbfcc17dd241", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 
0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 142336, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "4b6d31dbda41fc6a7254b1d1f35ee8f2d78487a0b5324825bb286841174710cc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 
-, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 142336, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "01ac1972c8619dae6f5f9cb2ea483dc74787c9a4a1fd061a4c1da0a536495313", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
-, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 
-, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 207872, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "d8758677ee9a0b7a691cb99b9266f6c6ef623b443c4cebc623280cea1d89c3a7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* 
mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 207872, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "02ffcbff39cec36b1b86fe850f1a045c370b91abcee962ae740c7acec9089585", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* 
mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 207872, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, 
"3f282f89b4eb6807b1443acaa05704158066313387f58ee3962c3d771e04a994", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule 
*/ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 207872, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "5155c850b832898fa16c230c7ffb46d4ad58e5cfb262537233a974e6fbb0b829", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 
1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 93184, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "b813d8ad3660fc9fd8abe190cc61e54b7b0da6b2929a77fef07e46f40cdfbe93", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* 
mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 93184, 
"gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "10a6feb413aa72adf40b177dc035856527e4197ad1d441de0b28b4ed4296a4a1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, 
/* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, -{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 93184, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "e15965ca77119094943368073578bcab1e5d183242dfae138e21c4309bd76520", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 
-, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 93184, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "90883a408e93752ca123fc9ca8911e9838c46fa74fcc8659c8d723c38da3169c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ 
trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 32 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, {Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100f_cubin, 
Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100f_cubin_len, 175104, "gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100f", 416, "04bd1a1ba2fe8048418649f0c900e2c92ab70a90550a3e4741db94a2c8eb08b7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 @@ -7610,146 +3140,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mWorldSize */ 1 }, gemm::SmVersion::Sm100f}, -{Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin, Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len, 183296, "gemm_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a", 320, "c93a7b367da32b1b13946b567e3f92ff050f67ed4facf0bf77e1e226b30a2224", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1056776) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 128 -, /* mGridTriggerSecondaryA */ 1 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 0 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ 
gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 128 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ std::nullopt -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a_cubin, Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a_cubin_len, 183296, "gemm_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a", 320, "44d282f6dd676fc5af3493552900b2051f655e7640de609c8489d0687c1f0bd9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1056776) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) 
-, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 128 -, /* mGridTriggerSecondaryA */ 1 -, /* mGridTriggerSecondaryB */ 0 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 -, /* mGridWaitForPrimaryB */ 0 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 128 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ std::nullopt -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mWorldSize */ 1 - }, gemm::SmVersion::Sm103a}, {Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100f_cubin, 
Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100f_cubin_len, 227328, "gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100f", 448, "1eb81ec19bfcecf7a5d37af9cfd00c8453b8a4d1b8ee676bc11da603e6207aee", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 @@ -7890,7 +3280,4633 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mWorldSize */ 1 }, gemm::SmVersion::Sm100f}, +#endif // EXCLUDE_SM_100F + +#ifndef EXCLUDE_SM_100 +{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len, 150528, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a", 320, "ef90d8459c870b5eaed737090a0839e43ab9b1979b8c55b650bb7f1fb2ef51a8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 1 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 0 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ 
gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len, 141312, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a", 416, "37a5775337ff0a8b330b1ccd09eb178bd454e714487f9abc2c6b932c3bf953fb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, 
/* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len, 141312, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a", 416, "0aedd71335551f1a9c341366d31aa8ffeab1ad556ebc5dfbb80dad0e95c0b33e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ 
trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 109568, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "b37d24294830c8f6855304535349059ebf843c500ea2ab9c59ea36f8f052e8ac", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* 
mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 109568, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "42027582fe2df190b57b14576111bf776d477585c30dd4c8ed9ae261d02420b3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ 
gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* 
mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 142336, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "c3b7eb7037b6b1a3da7f86b381f4952b46ecb8fee0afd6c5121a2fd10e0b3b12", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* 
mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 142336, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "f80bc152541ec413005bac03dcccc3687eb4d4da24f0f6a647b8888665e584e2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM 
*/ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 207872, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, 
"5a674a32baeffc1093f8a76334458bbe482d6e701b6333388c2411f42856b462", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule 
*/ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 207872, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "09406be5b805f2a09f947796794cc6114c077e5d8b78cf9b88d15cb3d936ceb4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* 
mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 93184, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "c102250f3b05e29eb10a84a236e5fb3e40ed798d442f840396bbd0de11514d0e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* 
mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 93184, 
"gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "34f0007c4c702045f3af86d4e35969eecfd8ddd393818243e7cd4b360896d8bb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ 
gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len, 124928, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a", 416, "3bf456c392699f5ea11c4dbb5c4b68f6c2c284d3ef2379972a53721a15b0b654", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, 
/* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len, 124928, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a", 416, "da2c25dbdbe1cad8ad0da45995aea8e1319f834c7148c026eef3cf2cb7d0059a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ 
trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, 
Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 109568, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "fb64a22c73ddb482832b7f6835561ff64580babc506359be77c4fed648f36745", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* 
mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 109568, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "2fd2af44f3d0a8c9717a473ca531ceea5c2eed6febc920e2896d3e93e4c3f326", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* 
mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 142336, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "b63dee9a4f3f1fd87dd214fdf34453af0de0839585ac5f4f53c4785c02c272a1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) 
+, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, 
+{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 142336, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "6b0fbc9d1ae53866911a0d2d6fcb7f8f141d234d9156b4aee0e85436789df5e0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ 
trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 207872, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "20dac3cbfa097ff056db9b0a27a3678cf82dabc9342f84cf810882a4c15d2ed2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, 
/* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 207872, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "03d9c2cfb6fb42e1f02e44dbcbc7bda901681dcc4c0609fb3989250082e4061e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX 
*/ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* 
mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 93184, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "7fc53732979eff7cddb888495edc7245d839281c4caabc994c462fc78a2c2b8c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* 
mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 93184, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "8374111249d6249c4ebfaa9ecb2e0fe0ba3b0a276b05997421f29ba33ea3bbe4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* 
mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin, Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len, 150528, "gemm_Fp16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a", 320, "c4911ee078a171f0d18a33ed002b9a160d2ad094065a35db07f532dd0a1804bc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* 
mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 1 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 0 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* 
mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len, 141312, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a", 416, "3c27f416cc42d36dc2f709079d063929c20e145750205723cab3c3959945f1d9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 
32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a_cubin_len, 141312, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm100a", 416, "5a56cb78ebad1aa89e7cf5ce6571411904b2ed4ff5c5a6d63ad95e2f83a8d745", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* 
mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 109568, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "227ee209841a0259a34adaae7777af1f7b5b54a6a9b4c21c5a5d753162a2a2ed", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK 
*/ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps 
*/ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 109568, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "71d6913e251937261256b569f3e7b04f5bd12b8ef10c0be7631100520a7e0d0c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* 
mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 142336, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "e62b2f9afdc741860cdb0b201c98881ba2eaff208fc4ecafa573c5286e03f7ef", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* 
mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 142336, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "4b6d31dbda41fc6a7254b1d1f35ee8f2d78487a0b5324825bb286841174710cc", { /* 
mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* 
mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 207872, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "d8758677ee9a0b7a691cb99b9266f6c6ef623b443c4cebc623280cea1d89c3a7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* 
mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 207872, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "3f282f89b4eb6807b1443acaa05704158066313387f58ee3962c3d771e04a994", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits 
*/ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 93184, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, 
"b813d8ad3660fc9fd8abe190cc61e54b7b0da6b2929a77fef07e46f40cdfbe93", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 
+, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a_cubin_len, 93184, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm100a", 416, "e15965ca77119094943368073578bcab1e5d183242dfae138e21c4309bd76520", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* 
mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin, Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len, 183296, "gemm_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a", 320, "c93a7b367da32b1b13946b567e3f92ff050f67ed4facf0bf77e1e226b30a2224", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN 
*/ 128 +, /* mGridTriggerSecondaryA */ 1 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 0 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm100a}, #endif // EXCLUDE_SM_100 + +#ifndef EXCLUDE_SM_103 +{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a_cubin_len, 150528, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a", 320, "43d51e76383ac46485f92f7fecb2e8caddd1d2c9e8f10e0bafb916268118ab71", { 
/* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 1 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 0 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 
0 +, /* mUseShuffledMatrixA */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin_len, 141312, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a", 416, "b4abc63c952b8cc92ffc0c7b11be86902687f745a840d42d54ad2be5ff44c784", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* 
mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin_len, 141312, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a", 416, "d6f7922ac084df6e8e1bd372bbd7c25787bcce8903da3cccc92f237fff7d3b4d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 
256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 109568, 
"gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "c3251a57acab6b10ea7bae87da99a1f5607f8f55decd5821b1131988095bb651", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ 
gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 109568, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "fe6415d35a3ac5797a0ee26a4d460e3c8cff7b4e254ca72e05bed5e449edf0fe", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* 
mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 142336, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "5b806c399c45dfa1859416c11712e2f546c016d6f5c31b46ec6d9a3333ed48db", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* 
mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, 
Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 142336, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "da5a3e58961302e97283e26a3dfd50e3177109e556d1e8c14822cbe709f63f4f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor 
*/ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 207872, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "c73e49bdf2aa8f04758f9492dd26abf1a7acea812c1f128c58b903d8e4878ad0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* 
mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 207872, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "b58d149aa8c6bea95da05afc0b0f60b5a9dc7b222871abb0989ab8f32f1a3f55", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, 
/* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 93184, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "4ace2312bc8ed97281bb47fb094a4872e220d8fd5f10571b26e61e396f7ba4a4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ 
trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 93184, "gemm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "07e9b5fef206d41ad09a69e670699bcd8b3c601673a57e28bce3cc07e5d77de3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 
1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin_len, 124928, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a", 416, "42312d539cc920bef178822c0352df9ad2d8bd3211440565061e762eaadf6cee", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* 
mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma 
*/ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin_len, 124928, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a", 416, "30c714e48bb7b16b59f0f9d35ac3295bba47cacfd9dd69ad58489dac3d0af54c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* 
mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 109568, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "72c2574a1a7dbf04739adc56f9e4dc88cf2b411c082b438c62b692877fb88427", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit 
*/ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 109568, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "3f903ab26b76e3b40812a5cf5f6c2399d8633f828f6fa15eb483c2841e5b73d8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ 
gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* 
mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 142336, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "ea3cb0e433e9adc7ca5ef2d0d2fd85e0f0a79e7af6a93d320aa632ef96264a64", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 
1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 142336, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "4621c71647fea809f2d6154bab8d465d6a106b843db7b903916cd63bb17d792a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* 
mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 207872, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, 
"2713f3b3e8d3caf85b7764d1d8ae6ffca410584f46752c37d8b31985c048c939", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule 
*/ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 207872, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "296994cb70cc9542dee7c38e7a7da3a7740a5a20e6f34d45e0e1adba64b96d25", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 
1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 93184, "gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "81a81d92172fa7cf1147cc7c1211863aa848cf720897a877506671b446679982", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* 
mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 93184, 
"gemm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "811e4f06c0680a4bc68433e142a36725232ec926cbe8e6dc244b77f7d3376b72", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) 
+, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a_cubin, Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a_cubin_len, 150528, "gemm_Fp16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a", 320, "408958f4aaed2d547a5f64dd65242db94cc0dbaa984786f75d9dd8d25c5e1c7b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 1 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 0 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK 
*/ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin_len, 141312, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a", 416, "405da92fd8006ab1611b733e5644296a37df850dba715fa2ca9c88698a253d42", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* 
mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a_cubin_len, 141312, 
"gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedPx3_sm103a", 416, "0732c66f96b87a168702fedb2b5a2c810e803b834af7016f5bdc83e41b0bb59b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ 
gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 109568, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "0ca9acbdae5defe2213c77124bbe6bd46ff1b340aaa84809d7c36017d6a81c9f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ 
trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 109568, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x512u2_s2_et128x16_m128x16x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "3ad16108e3190414a1ae268e035f4ea4c251b6c598cf1e2c6637a91bd145b66d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ 
trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, 
Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 142336, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "1b9fe74a9f80a5eadfb6cda1bb0cfdd719969280a98bc5efd76afbfcc17dd241", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* 
mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 142336, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "01ac1972c8619dae6f5f9cb2ea483dc74787c9a4a1fd061a4c1da0a536495313", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* 
mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 207872, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "02ffcbff39cec36b1b86fe850f1a045c370b91abcee962ae740c7acec9089585", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) 
+, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, 
+{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 207872, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "5155c850b832898fa16c230c7ffb46d4ad58e5cfb262537233a974e6fbb0b829", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ 
trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 93184, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "10a6feb413aa72adf40b177dc035856527e4197ad1d441de0b28b4ed4296a4a1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* 
mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin, Gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a_cubin_len, 93184, "gemm_Fp16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s2_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedPx3_sm103a", 416, "90883a408e93752ca123fc9ca8911e9838c46fa74fcc8659c8d723c38da3169c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* 
mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 32 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 
+, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +{Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a_cubin, Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a_cubin_len, 183296, "gemm_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm103a", 320, "44d282f6dd676fc5af3493552900b2051f655e7640de609c8489d0687c1f0bd9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 128 +, /* mGridTriggerSecondaryA */ 1 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 0 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ std::nullopt +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB 
*/ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrixA */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mWorldSize */ 1 + }, gemm::SmVersion::Sm103a}, +#endif // EXCLUDE_SM_103 }; // clang-format on } // namespace kernels diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelTraits.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelTraits.h index 0e528a7774..852295ceb6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelTraits.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelTraits.h @@ -537,7 +537,7 @@ public: public: // The MMA kind. - tg::MmaKind mMmaKind; + tg::MmaKind mMmaKind{}; // Helper for SMEM allocation. MemAllocatorHelper mSmemAllocatorHelper; // Helper for TMEM allocation. 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/CMakeLists.txt b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/CMakeLists.txt index 78e7b4d85c..59d21f4678 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/CMakeLists.txt +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/CMakeLists.txt @@ -18,9 +18,13 @@ file(GLOB_RECURSE SRC_CPP *.cpp) file(GLOB_RECURSE SRC_CU *.cu) -filter_cuda_archs("100" SRC_CPP) +add_library(trtllm_gen_gemm_gated_act OBJECT) +filter_source_cuda_architectures( + SOURCE_LIST SRC_CPP + ARCHS 100f + TARGET trtllm_gen_gemm_gated_act) -add_library(trtllm_gen_gemm_gated_act OBJECT ${SRC_CPP} ${SRC_CU}) +target_sources(trtllm_gen_gemm_gated_act PRIVATE ${SRC_CPP} ${SRC_CU}) target_compile_definitions(trtllm_gen_gemm_gated_act PUBLIC TLLM_GEN_EXPORT_INTERFACE TLLM_ENABLE_CUDA) set_property(TARGET trtllm_gen_gemm_gated_act PROPERTY POSITION_INDEPENDENT_CODE diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmGatedActInterface.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmGatedActInterface.h index ff75b0f30a..8153d8c971 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmGatedActInterface.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmGatedActInterface.h @@ -316,7 +316,8 @@ GemmGatedActConfig const* GemmGatedActInterface::getGemmConfigs() const size_t GemmGatedActInterface::getNumGemmConfigs() const { #ifdef TLLM_GEN_EXPORT_INTERFACE - return tensorrt_llm::kernels::tllmGenGemmGatedActListLen; + return sizeof(tensorrt_llm::kernels::tllmGenGemmGatedActList) + / sizeof(tensorrt_llm::kernels::tllmGenGemmGatedActList[0]); #else return 0; #endif diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelMetaInfo.h 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelMetaInfo.h index c52dd9f29f..4c7ef58031 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelMetaInfo.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelMetaInfo.h @@ -33,7 +33,7 @@ namespace kernels static constexpr size_t tllmGenGemmGatedActListLen = 13; -#ifndef EXCLUDE_SM_100 +#ifndef EXCLUDE_SM_100F extern unsigned char GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f_cubin[]; extern unsigned char GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100f_cubin[]; extern unsigned char GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100f_cubin[]; @@ -47,9 +47,9 @@ extern unsigned char GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et1 extern unsigned char GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100f_cubin[]; extern unsigned char GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f_cubin[]; extern unsigned char GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f_cubin[]; -#endif // EXCLUDE_SM_100 +#endif // EXCLUDE_SM_100F -#ifndef EXCLUDE_SM_100 +#ifndef EXCLUDE_SM_100F extern unsigned int GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f_cubin_len; extern unsigned int GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100f_cubin_len; extern unsigned int 
GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100f_cubin_len; @@ -63,11 +63,11 @@ extern unsigned int GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et12 extern unsigned int GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100f_cubin_len; extern unsigned int GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f_cubin_len; extern unsigned int GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f_cubin_len; -#endif // EXCLUDE_SM_100 +#endif // EXCLUDE_SM_100F static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { -#ifndef EXCLUDE_SM_100 +#ifndef EXCLUDE_SM_100F {GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f_cubin, GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f_cubin_len, 86016, "gemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f", 448, "9676717a6339c1f0ed39de9935975d4d60468440174326380990d70d75cf357a", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 @@ -138,7 +138,7 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, + }, gemm::SmVersion::Sm100f}, {GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100f_cubin, 
GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100f_cubin_len, 168960, "gemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100f", 224, "ec67e15fd7862d51b19dda2176f8129e4592a748eefb3e48b6a3fbc41868d6f2", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 @@ -209,7 +209,7 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, + }, gemm::SmVersion::Sm100f}, {GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100f_cubin, GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100f_cubin_len, 112640, "gemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100f", 224, "caa8817305ecc3e91818c767f6a7989b7db55b6c49232c9a2e32d1c907228684", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 @@ -280,7 +280,7 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, + }, gemm::SmVersion::Sm100f}, {GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f_cubin, GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f_cubin_len, 110592, 
"gemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f", 224, "48fc7f954cbb918fab79ff2ee54159c00da0687ee4ca60036b9fe5c746d97a2c", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 @@ -351,7 +351,7 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, + }, gemm::SmVersion::Sm100f}, {GemmGatedActKernel_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f_cubin, GemmGatedActKernel_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f_cubin_len, 86016, "gemmGatedActKernel_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f", 448, "0d61dc1cfefc5ca6d5e90afa70347394930cbf4c79eea20c1518de8e1222da54", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 @@ -422,7 +422,7 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, + }, gemm::SmVersion::Sm100f}, {GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100f_cubin, GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100f_cubin_len, 152576, "gemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100f", 224, "403cbe5700ce7b2749b49c595661ceeedbf6e50c580709a37a610cc54a239b4d", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ 
gemm::BiasType(0) , /* mBlockK */ -1 @@ -493,7 +493,7 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, + }, gemm::SmVersion::Sm100f}, {GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100f_cubin, GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100f_cubin_len, 111616, "gemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100f", 224, "c2d4a41c6e16594d2c46fece2ee0a15e783d4c914bf698c4b7fd7ac0c14c2dfc", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 @@ -564,7 +564,7 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, + }, gemm::SmVersion::Sm100f}, {GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f_cubin, GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f_cubin_len, 110592, "gemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f", 224, "20c18093aeca9040c7914535312f7980bf9cfbb77430dc0ef9660d7a8b2f3340", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 @@ -635,7 +635,7 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, 
+ }, gemm::SmVersion::Sm100f}, {GemmGatedActKernel_Fp16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f_cubin, GemmGatedActKernel_Fp16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f_cubin_len, 86016, "gemmGatedActKernel_Fp16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f", 448, "375daed8ff4ada0d0117299a9fd882c0598feeff34924a9d1479b1bc2a0c473e", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 @@ -706,7 +706,7 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, + }, gemm::SmVersion::Sm100f}, {GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100f_cubin, GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100f_cubin_len, 168960, "gemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100f", 224, "d0abb1065fc2517d1208f38e0716cf6fa177a165ae5c560033fd2e1e81bffe16", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 @@ -777,7 +777,7 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, + }, gemm::SmVersion::Sm100f}, {GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100f_cubin, 
GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100f_cubin_len, 112640, "gemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100f", 224, "22fefb0cbe12f8af7719f7dd3e9271d7386f20e57d8dd2809587d0e825d13681", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 @@ -848,7 +848,7 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, + }, gemm::SmVersion::Sm100f}, {GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f_cubin, GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f_cubin_len, 110592, "gemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f", 224, "48101b606e52817efdcfc9a17ad91b224fa8fd6c36283f5f370b8f6ae729fadc", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 @@ -919,7 +919,7 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, + }, gemm::SmVersion::Sm100f}, {GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f_cubin, GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f_cubin_len, 86016, 
"gemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100f", 448, "0e648979b852a9c612006fa871fc73dc427ceacbd79201c7d1b4e52a0d64aec7", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 @@ -990,8 +990,8 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, -#endif // EXCLUDE_SM_100 + }, gemm::SmVersion::Sm100f}, +#endif // EXCLUDE_SM_100F }; // clang-format on } // namespace kernels diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelTraits.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelTraits.h index 971d8beaea..8097c2ad36 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelTraits.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelTraits.h @@ -537,7 +537,7 @@ public: public: // The MMA kind. - tg::MmaKind mMmaKind; + tg::MmaKind mMmaKind{}; // Helper for SMEM allocation. MemAllocatorHelper mSmemAllocatorHelper; // Helper for TMEM allocation.